From f2b20d3410e4c0cc3be4a5b69e00120cab9f1d5e Mon Sep 17 00:00:00 2001 From: agozillon Date: Tue, 21 Oct 2025 21:54:25 +0200 Subject: [PATCH 001/530] [Flang][OpenMP][Dialect] Swap to using MLIR dialect enum to encode map flags (#164043) This PR shifts from using the LLVM OpenMP enumerator bit flags to an OpenMP dialect-specific enumerator. This allows us to better represent map types in the dialect that wouldn't be of interest to the LLVM backend and runtime. Primarily these are flags like ref_ptr/ref_ptee/ref_ptr_ptee/attach_none/attach_always/attach_auto, which are of interest to the compiler for certain transformations (primarily in the FIR transformation passes dealing with mapping) but which the runtime has no need to know about. It also means that if another OpenMP implementation comes along, it won't need to stick to the same bit-flag system LLVM chose, or do legwork to address it. --- flang/include/flang/Lower/DirectivesCommon.h | 1 - flang/include/flang/Utils/OpenMP.h | 5 +- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 56 +++--- flang/lib/Lower/OpenMP/ClauseProcessor.h | 3 +- flang/lib/Lower/OpenMP/Clauses.cpp | 2 - flang/lib/Lower/OpenMP/OpenMP.cpp | 46 ++--- flang/lib/Lower/OpenMP/Utils.cpp | 19 +- flang/lib/Lower/OpenMP/Utils.h | 2 +- .../Optimizer/OpenMP/AutomapToTargetData.cpp | 11 +- .../OpenMP/DoConcurrentConversion.cpp | 15 +- .../Optimizer/OpenMP/LowerWorkdistribute.cpp | 32 ++-- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 62 +++---- .../OpenMP/MapsForPrivatizedSymbols.cpp | 7 +- flang/lib/Semantics/check-omp-structure.h | 1 - flang/lib/Utils/OpenMP.cpp | 18 +- .../Fir/convert-to-llvm-openmp-and-fir.fir | 8 +- ...rget-teams-private-implicit-scalar-map.f90 | 6 +- flang/test/Lower/OpenMP/common-block-map.f90 | 2 +- flang/test/Lower/OpenMP/declare-mapper.f90 | 2 +- flang/test/Lower/OpenMP/defaultmap.f90 | 6 +- .../Lower/OpenMP/has_device_addr-mapinfo.f90 | 4 +- .../local-intrinsic-sized-array-map.f90 | 2 +- .../Lower/OpenMP/optional-argument-map-2.f90 | 2 +- flang/test/Lower/OpenMP/target.f90 | 12 +- .../DoConcurrent/map_shape_info.f90 | 8 +- .../DoConcurrent/non_reference_to_device.f90 | 2 +- .../lower-workdistribute-fission-host.mlir | 8 +- .../lower-workdistribute-fission-target.mlir | 8 +- .../Transforms/omp-map-info-finalization.fir | 8 +- .../mlir/Dialect/OpenMP/OpenMPEnums.td | 54 ++++++ mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 2 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 170 ++++++++++-------- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 64 ++++++- .../OpenMPToLLVM/convert-to-llvmir.mlir | 12 +- mlir/test/Dialect/OpenMP/ops.mlir | 80 +++++++-- 35 files changed, 422 insertions(+), 318 deletions(-) diff --git a/flang/include/flang/Lower/DirectivesCommon.h b/flang/include/flang/Lower/DirectivesCommon.h index 6ed3c1b7d61ee..2d6906738773a 100644 --- a/flang/include/flang/Lower/DirectivesCommon.h +++ b/flang/include/flang/Lower/DirectivesCommon.h @@ -39,7 +39,6 @@ #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Value.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" #include #include diff --git a/flang/include/flang/Utils/OpenMP.h b/flang/include/flang/Utils/OpenMP.h index 01a94c9099808..bad0abb6f5788 100644 --- a/flang/include/flang/Utils/OpenMP.h +++ b/flang/include/flang/Utils/OpenMP.h @@ -29,8 +29,9 @@ mlir::omp::MapInfoOp createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, mlir::Value varPtrPtr, llvm::StringRef name, llvm::ArrayRef bounds, llvm::ArrayRef members, mlir::ArrayAttr
membersIndex, - uint64_t mapType, mlir::omp::VariableCaptureKind mapCaptureType, - mlir::Type retTy, bool partialMap = false, + mlir::omp::ClauseMapFlags mapType, + mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, + bool partialMap = false, mlir::FlatSymbolRefAttr mapperId = mlir::FlatSymbolRefAttr()); /// For an mlir value that does not have storage, allocate temporary storage diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 85398be778382..1c163e6de7e5a 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -1080,9 +1080,8 @@ bool ClauseProcessor::processHasDeviceAddr( [&](const omp::clause::HasDeviceAddr &clause, const parser::CharBlock &source) { mlir::Location location = converter.genLocation(source); - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::ClauseMapFlags mapTypeBits = + mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::implicit; omp::ObjectList baseObjects; llvm::transform(clause.v, std::back_inserter(baseObjects), [&](const omp::Object &object) { @@ -1217,8 +1216,7 @@ bool ClauseProcessor::processLink( void ClauseProcessor::processMapObjects( lower::StatementContext &stmtCtx, mlir::Location clauseLocation, - const omp::ObjectList &objects, - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, + const omp::ObjectList &objects, mlir::omp::ClauseMapFlags mapTypeBits, std::map &parentMemberIndices, llvm::SmallVectorImpl &mapVars, llvm::SmallVectorImpl &mapSyms, @@ -1310,10 +1308,7 @@ void ClauseProcessor::processMapObjects( mlir::omp::MapInfoOp mapOp = utils::openmp::createMapInfoOp( firOpBuilder, location, baseOp, /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, - /*members=*/{}, /*membersIndex=*/mlir::ArrayAttr{}, - static_cast< - std::underlying_type_t>( - mapTypeBits), + /*members=*/{}, /*membersIndex=*/mlir::ArrayAttr{}, mapTypeBits, mlir::omp::VariableCaptureKind::ByRef, baseOp.getType(), /*partialMap=*/false, mapperId); @@ -1347,8 +1342,7 @@ bool ClauseProcessor::processMap( objects] = clause.t; if (attachMod) TODO(currentLocation, "ATTACH modifier is not implemented yet"); - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + mlir::omp::ClauseMapFlags mapTypeBits = mlir::omp::ClauseMapFlags::none; std::string mapperIdName = "__implicit_mapper"; // If the map type is specified, then process it else set the appropriate // default value @@ -1364,36 +1358,32 @@ bool ClauseProcessor::processMap( switch (type) { case Map::MapType::To: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapTypeBits |= mlir::omp::ClauseMapFlags::to; break; case Map::MapType::From: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + mapTypeBits |= mlir::omp::ClauseMapFlags::from; break; case Map::MapType::Tofrom: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + mapTypeBits |= + mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::from; break; case Map::MapType::Storage: - // alloc and release is the default map_type for the Target Data - // Ops, i.e. if no bits for map_type is supplied then alloc/release - // (aka storage in 6.0+) is implicitly assumed based on the target - // directive. 
Default value for Target Data and Enter Data is alloc - // and for Exit Data it is release. + mapTypeBits |= mlir::omp::ClauseMapFlags::storage; break; } if (typeMods) { // TODO: Still requires "self" modifier, an OpenMP 6.0+ feature if (llvm::is_contained(*typeMods, Map::MapTypeModifier::Always)) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; + mapTypeBits |= mlir::omp::ClauseMapFlags::always; if (llvm::is_contained(*typeMods, Map::MapTypeModifier::Present)) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT; + mapTypeBits |= mlir::omp::ClauseMapFlags::present; if (llvm::is_contained(*typeMods, Map::MapTypeModifier::Close)) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE; + mapTypeBits |= mlir::omp::ClauseMapFlags::close; if (llvm::is_contained(*typeMods, Map::MapTypeModifier::Delete)) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; + mapTypeBits |= mlir::omp::ClauseMapFlags::del; if (llvm::is_contained(*typeMods, Map::MapTypeModifier::OmpxHold)) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_OMPX_HOLD; + mapTypeBits |= mlir::omp::ClauseMapFlags::ompx_hold; } if (iterator) { @@ -1437,12 +1427,12 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, TODO(clauseLocation, "Iterator modifier is not supported yet"); } - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + mlir::omp::ClauseMapFlags mapTypeBits = std::is_same_v, omp::clause::To> - ? llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO - : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + ? mlir::omp::ClauseMapFlags::to + : mlir::omp::ClauseMapFlags::from; if (expectation && *expectation == omp::clause::To::Expectation::Present) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT; + mapTypeBits |= mlir::omp::ClauseMapFlags::present; processMapObjects(stmtCtx, clauseLocation, objects, mapTypeBits, parentMemberIndices, result.mapVars, mapSymbols); }; @@ -1568,8 +1558,8 @@ bool ClauseProcessor::processUseDeviceAddr( [&](const omp::clause::UseDeviceAddr &clause, const parser::CharBlock &source) { mlir::Location location = converter.genLocation(source); - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + mlir::omp::ClauseMapFlags mapTypeBits = + mlir::omp::ClauseMapFlags::return_param; processMapObjects(stmtCtx, location, clause.v, mapTypeBits, parentMemberIndices, result.useDeviceAddrVars, useDeviceSyms); @@ -1589,8 +1579,8 @@ bool ClauseProcessor::processUseDevicePtr( [&](const omp::clause::UseDevicePtr &clause, const parser::CharBlock &source) { mlir::Location location = converter.genLocation(source); - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + mlir::omp::ClauseMapFlags mapTypeBits = + mlir::omp::ClauseMapFlags::return_param; processMapObjects(stmtCtx, location, clause.v, mapTypeBits, parentMemberIndices, result.useDevicePtrVars, useDeviceSyms); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 9e352fa574a97..6452e39b97551 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -194,8 +194,7 @@ class ClauseProcessor { void processMapObjects( lower::StatementContext &stmtCtx, mlir::Location clauseLocation, - const omp::ObjectList &objects, - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, + const omp::ObjectList &objects, mlir::omp::ClauseMapFlags 
mapTypeBits, std::map &parentMemberIndices, llvm::SmallVectorImpl &mapVars, llvm::SmallVectorImpl &mapSyms, diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 2a4ebf10bcafd..d39f9dda92a28 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -16,8 +16,6 @@ #include "flang/Semantics/openmp-modifiers.h" #include "flang/Semantics/symbol.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" - #include #include #include diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 9495ea61058ca..a49961cc233c6 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -45,7 +45,6 @@ #include "mlir/Support/StateStack.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" using namespace Fortran::lower::omp; using namespace Fortran::common::openmp; @@ -945,8 +944,7 @@ getDefaultmapIfPresent(const DefaultMapsTy &defaultMaps, mlir::Type varType) { return DefMap::ImplicitBehavior::Default; } -static std::pair +static std::pair getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, lower::AbstractConverter &converter, const DefaultMapsTy &defaultMaps, mlir::Type varType, @@ -967,8 +965,7 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, return size <= ptrSize && align <= ptrAlign; }; - llvm::omp::OpenMPOffloadMappingFlags mapFlag = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::ClauseMapFlags mapFlag = mlir::omp::ClauseMapFlags::implicit; auto implicitBehaviour = getDefaultmapIfPresent(defaultMaps, varType); if (implicitBehaviour == DefMap::ImplicitBehavior::Default) { @@ -986,8 +983,8 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, mlir::omp::DeclareTargetCaptureClause::link && declareTargetOp.getDeclareTargetDeviceType() != mlir::omp::DeclareTargetDeviceType::nohost) { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + mapFlag |= mlir::omp::ClauseMapFlags::to; + mapFlag |= mlir::omp::ClauseMapFlags::from; } } else if (fir::isa_trivial(varType) || fir::isa_char(varType)) { // Scalars behave as if they were "firstprivate". 
@@ -996,18 +993,18 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, if (isLiteralType(varType)) { captureKind = mlir::omp::VariableCaptureKind::ByCopy; } else { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= mlir::omp::ClauseMapFlags::to; } } else if (!fir::isa_builtin_cptr_type(varType)) { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + mapFlag |= mlir::omp::ClauseMapFlags::to; + mapFlag |= mlir::omp::ClauseMapFlags::from; } return std::make_pair(mapFlag, captureKind); } switch (implicitBehaviour) { case DefMap::ImplicitBehavior::Alloc: - return std::make_pair(llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE, + return std::make_pair(mlir::omp::ClauseMapFlags::storage, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Firstprivate: @@ -1016,26 +1013,22 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, "behaviour"); break; case DefMap::ImplicitBehavior::From: - return std::make_pair(mapFlag |= - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM, + return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::from, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Present: - return std::make_pair(mapFlag |= - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT, + return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::present, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::To: - return std::make_pair(mapFlag |= - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO, + return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::to, (fir::isa_trivial(varType) || fir::isa_char(varType)) ? mlir::omp::VariableCaptureKind::ByCopy : mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Tofrom: - return std::make_pair(mapFlag |= - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO, + return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::from | + mlir::omp::ClauseMapFlags::to, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Default: @@ -1044,9 +1037,8 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, break; } - return std::make_pair(mapFlag |= - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO, + return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::from | + mlir::omp::ClauseMapFlags::to, mlir::omp::VariableCaptureKind::ByRef); } @@ -2612,18 +2604,14 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, if (auto refType = mlir::dyn_cast(baseOp.getType())) eleType = refType.getElementType(); - std::pair + std::pair mapFlagAndKind = getImplicitMapTypeAndKind( firOpBuilder, converter, defaultMaps, eleType, loc, sym); mlir::Value mapOp = createMapInfoOp( firOpBuilder, converter.getCurrentLocation(), baseOp, /*varPtrPtr=*/mlir::Value{}, name.str(), bounds, /*members=*/{}, - /*membersIndex=*/mlir::ArrayAttr{}, - static_cast< - std::underlying_type_t>( - std::get<0>(mapFlagAndKind)), + /*membersIndex=*/mlir::ArrayAttr{}, std::get<0>(mapFlagAndKind), std::get<1>(mapFlagAndKind), baseOp.getType(), /*partialMap=*/false, mapperId); diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 37b926e2f23f2..6487f599df72a 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -273,7 +273,7 @@ mlir::Value 
createParentSymAndGenIntermediateMaps( semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, omp::ObjectList &objectList, llvm::SmallVectorImpl &indices, OmpMapParentAndMemberData &parentMemberIndices, llvm::StringRef asFortran, - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits) { + mlir::omp::ClauseMapFlags mapTypeBits) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); /// Checks if an omp::Object is an array expression with a subscript, e.g. @@ -414,11 +414,10 @@ mlir::Value createParentSymAndGenIntermediateMaps( // be safer to just pass OMP_MAP_NONE as the map type, but we may still // need some of the other map types the mapped member utilises, so for // now it's good to keep an eye on this. - llvm::omp::OpenMPOffloadMappingFlags interimMapType = mapTypeBits; - interimMapType &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - interimMapType &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - interimMapType &= - ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + mlir::omp::ClauseMapFlags interimMapType = mapTypeBits; + interimMapType &= ~mlir::omp::ClauseMapFlags::to; + interimMapType &= ~mlir::omp::ClauseMapFlags::from; + interimMapType &= ~mlir::omp::ClauseMapFlags::return_param; // Create a map for the intermediate member and insert it and it's // indices into the parentMemberIndices list to track it. @@ -427,10 +426,7 @@ mlir::Value createParentSymAndGenIntermediateMaps( /*varPtrPtr=*/mlir::Value{}, asFortran, /*bounds=*/interimBounds, /*members=*/{}, - /*membersIndex=*/mlir::ArrayAttr{}, - static_cast< - std::underlying_type_t>( - interimMapType), + /*membersIndex=*/mlir::ArrayAttr{}, interimMapType, mlir::omp::VariableCaptureKind::ByRef, curValue.getType()); parentMemberIndices.memberPlacementIndices.push_back(interimIndices); @@ -563,7 +559,8 @@ void insertChildMapInfoIntoParent( // it allows this to work with enter and exit without causing MLIR // verification issues. The more appropriate thing may be to take // the "main" map type clause from the directive being used. 
- uint64_t mapType = indices.second.memberMap[0].getMapType(); + mlir::omp::ClauseMapFlags mapType = + indices.second.memberMap[0].getMapType(); llvm::SmallVector members; members.reserve(indices.second.memberMap.size()); diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index 69499f9c7b621..ef1f37ac25529 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -134,7 +134,7 @@ mlir::Value createParentSymAndGenIntermediateMaps( semantics::SemanticsContext &semaCtx, lower::StatementContext &stmtCtx, omp::ObjectList &objectList, llvm::SmallVectorImpl &indices, OmpMapParentAndMemberData &parentMemberIndices, llvm::StringRef asFortran, - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits); + mlir::omp::ClauseMapFlags mapTypeBits); omp::ObjectList gatherObjectsOf(omp::Object derivedTypeMember, semantics::SemanticsContext &semaCtx); diff --git a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp index 8b9991301aae8..817434ff3dc30 100644 --- a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp +++ b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp @@ -20,8 +20,6 @@ #include "mlir/IR/Operation.h" #include "mlir/Pass/Pass.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" - namespace flangomp { #define GEN_PASS_DEF_AUTOMAPTOTARGETDATAPASS #include "flang/Optimizer/OpenMP/Passes.h.inc" @@ -120,12 +118,9 @@ class AutomapToTargetDataPass builder, memOp.getLoc(), memOp.getMemref().getType(), memOp.getMemref(), TypeAttr::get(fir::unwrapRefType(memOp.getMemref().getType())), - builder.getIntegerAttr( - builder.getIntegerType(64, false), - static_cast( - isa(memOp) - ? llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO - : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE)), + builder.getAttr( + isa(memOp) ? 
omp::ClauseMapFlags::to + : omp::ClauseMapFlags::del), builder.getAttr( omp::VariableCaptureKind::ByCopy), /*var_ptr_ptr=*/mlir::Value{}, diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 03ff16366a9d2..65a23be243716 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -22,7 +22,6 @@ #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" namespace flangomp { #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS @@ -568,16 +567,15 @@ class DoConcurrentConversion if (auto refType = mlir::dyn_cast(liveInType)) eleType = refType.getElementType(); - llvm::omp::OpenMPOffloadMappingFlags mapFlag = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::ClauseMapFlags mapFlag = mlir::omp::ClauseMapFlags::implicit; mlir::omp::VariableCaptureKind captureKind = mlir::omp::VariableCaptureKind::ByRef; if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { captureKind = mlir::omp::VariableCaptureKind::ByCopy; } else if (!fir::isa_builtin_cptr_type(eleType)) { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + mapFlag |= mlir::omp::ClauseMapFlags::to; + mapFlag |= mlir::omp::ClauseMapFlags::from; } llvm::SmallVector boundsOps; @@ -587,11 +585,8 @@ class DoConcurrentConversion builder, liveIn.getLoc(), rawAddr, /*varPtrPtr=*/{}, name.str(), boundsOps, /*members=*/{}, - /*membersIndex=*/mlir::ArrayAttr{}, - static_cast< - std::underlying_type_t>( - mapFlag), - captureKind, rawAddr.getType()); + /*membersIndex=*/mlir::ArrayAttr{}, mapFlag, captureKind, + rawAddr.getType()); } mlir::omp::TargetOp diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp index 9278e17e74d1b..8a9b383ec1356 100644 --- a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp +++ b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp @@ -719,10 +719,9 @@ FailureOr splitTargetData(omp::TargetOp targetOp, SmallVector outerMapInfos; // Create new mapinfo ops for the inner target region for (auto mapInfo : mapInfos) { - auto originalMapType = - (llvm::omp::OpenMPOffloadMappingFlags)(mapInfo.getMapType()); + mlir::omp::ClauseMapFlags originalMapType = mapInfo.getMapType(); auto originalCaptureType = mapInfo.getMapCaptureType(); - llvm::omp::OpenMPOffloadMappingFlags newMapType; + mlir::omp::ClauseMapFlags newMapType; mlir::omp::VariableCaptureKind newCaptureType; // For bycopy, we keep the same map type and capture type // For byref, we change the map type to none and keep the capture type @@ -730,7 +729,7 @@ FailureOr splitTargetData(omp::TargetOp targetOp, newMapType = originalMapType; newCaptureType = originalCaptureType; } else if (originalCaptureType == mlir::omp::VariableCaptureKind::ByRef) { - newMapType = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + newMapType = mlir::omp::ClauseMapFlags::storage; newCaptureType = originalCaptureType; outerMapInfos.push_back(mapInfo); } else { @@ -738,11 +737,8 @@ FailureOr splitTargetData(omp::TargetOp targetOp, return failure(); } auto innerMapInfo = cast(rewriter.clone(*mapInfo)); - innerMapInfo.setMapTypeAttr(rewriter.getIntegerAttr( - rewriter.getIntegerType(64, false), - static_cast< - std::underlying_type_t>( - newMapType))); + innerMapInfo.setMapTypeAttr( + 
rewriter.getAttr(newMapType)); innerMapInfo.setMapCaptureType(newCaptureType); innerMapInfos.push_back(innerMapInfo.getResult()); } @@ -834,11 +830,11 @@ static TempOmpVar allocateTempOmpVar(Location loc, Type ty, alloc = rewriter.create(loc, allocType); } // Lambda to create mapinfo ops - auto getMapInfo = [&](uint64_t mappingFlags, const char *name) { + auto getMapInfo = [&](mlir::omp::ClauseMapFlags mappingFlags, + const char *name) { return rewriter.create( loc, alloc.getType(), alloc, TypeAttr::get(allocType), - rewriter.getIntegerAttr(rewriter.getIntegerType(64, /*isSigned=*/false), - mappingFlags), + rewriter.getAttr(mappingFlags), rewriter.getAttr( omp::VariableCaptureKind::ByRef), /*varPtrPtr=*/Value{}, @@ -849,14 +845,10 @@ static TempOmpVar allocateTempOmpVar(Location loc, Type ty, /*name=*/rewriter.getStringAttr(name), rewriter.getBoolAttr(false)); }; // Create mapinfo ops. - uint64_t mapFrom = - static_cast>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); - uint64_t mapTo = - static_cast>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); - auto mapInfoFrom = getMapInfo(mapFrom, "__flang_workdistribute_from"); - auto mapInfoTo = getMapInfo(mapTo, "__flang_workdistribute_to"); + auto mapInfoFrom = getMapInfo(mlir::omp::ClauseMapFlags::from, + "__flang_workdistribute_from"); + auto mapInfoTo = + getMapInfo(mlir::omp::ClauseMapFlags::to, "__flang_workdistribute_to"); return TempOmpVar{mapInfoFrom, mapInfoTo}; } diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 2bbd8034fa523..566e88b9d6588 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -43,7 +43,6 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringSet.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -350,7 +349,7 @@ class MapInfoFinalizationPass /// the descriptor map onto the base address map. mlir::omp::MapInfoOp genBaseAddrMap(mlir::Value descriptor, mlir::OperandRange bounds, - int64_t mapType, + mlir::omp::ClauseMapFlags mapType, fir::FirOpBuilder &builder) { mlir::Location loc = descriptor.getLoc(); mlir::Value baseAddrAddr = fir::BoxOffsetOp::create( @@ -368,7 +367,7 @@ class MapInfoFinalizationPass return mlir::omp::MapInfoOp::create( builder, loc, baseAddrAddr.getType(), descriptor, mlir::TypeAttr::get(underlyingVarType), - builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + builder.getAttr(mapType), builder.getAttr( mlir::omp::VariableCaptureKind::ByRef), baseAddrAddr, /*members=*/mlir::SmallVector{}, @@ -428,22 +427,22 @@ class MapInfoFinalizationPass /// allowing `to` mappings, and `target update` not allowing both `to` and /// `from` simultaneously. We currently try to maintain the `implicit` flag /// where necessary, although it does not seem strictly required. 
- unsigned long getDescriptorMapType(unsigned long mapTypeFlag, - mlir::Operation *target) { - using mapFlags = llvm::omp::OpenMPOffloadMappingFlags; + mlir::omp::ClauseMapFlags + getDescriptorMapType(mlir::omp::ClauseMapFlags mapTypeFlag, + mlir::Operation *target) { + using mapFlags = mlir::omp::ClauseMapFlags; if (llvm::isa_and_nonnull(target)) return mapTypeFlag; - mapFlags flags = mapFlags::OMP_MAP_TO | - (mapFlags(mapTypeFlag) & - (mapFlags::OMP_MAP_IMPLICIT | mapFlags::OMP_MAP_ALWAYS)); + mapFlags flags = + mapFlags::to | (mapTypeFlag & (mapFlags::implicit | mapFlags::always)); // For unified_shared_memory, we additionally add `CLOSE` on the descriptor // to ensure device-local placement where required by tests relying on USM + // close semantics. if (moduleRequiresUSM(target->getParentOfType())) - flags |= mapFlags::OMP_MAP_CLOSE; - return llvm::to_underlying(flags); + flags |= mapFlags::close; + return flags; } /// Check if the mapOp is present in the HasDeviceAddr clause on @@ -493,11 +492,6 @@ class MapInfoFinalizationPass mlir::Value boxAddr = fir::BoxOffsetOp::create( builder, loc, op.getVarPtr(), fir::BoxFieldAttr::base_addr); - uint64_t mapTypeToImplicit = static_cast< - std::underlying_type_t>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT); - mlir::ArrayAttr newMembersAttr; llvm::SmallVector> memberIdx = {{0}}; newMembersAttr = builder.create2DI64ArrayAttr(memberIdx); @@ -506,8 +500,9 @@ class MapInfoFinalizationPass mlir::omp::MapInfoOp memberMapInfoOp = mlir::omp::MapInfoOp::create( builder, op.getLoc(), varPtr.getType(), varPtr, mlir::TypeAttr::get(boxCharType.getEleTy()), - builder.getIntegerAttr(builder.getIntegerType(64, /*isSigned=*/false), - mapTypeToImplicit), + builder.getAttr( + mlir::omp::ClauseMapFlags::to | + mlir::omp::ClauseMapFlags::implicit), builder.getAttr( mlir::omp::VariableCaptureKind::ByRef), /*varPtrPtr=*/boxAddr, @@ -568,12 +563,9 @@ class MapInfoFinalizationPass mlir::ArrayAttr newMembersAttr = builder.create2DI64ArrayAttr(memberIdx); // Force CLOSE in USM paths so the pointer gets device-local placement // when required by tests relying on USM + close semantics. - uint64_t mapTypeVal = - op.getMapType() | - llvm::to_underlying( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE); - mlir::IntegerAttr mapTypeAttr = builder.getIntegerAttr( - builder.getIntegerType(64, /*isSigned=*/false), mapTypeVal); + mlir::omp::ClauseMapFlagsAttr mapTypeAttr = + builder.getAttr( + op.getMapType() | mlir::omp::ClauseMapFlags::close); mlir::omp::MapInfoOp memberMap = mlir::omp::MapInfoOp::create( builder, loc, coord.getType(), coord, @@ -683,17 +675,16 @@ class MapInfoFinalizationPass // one place in the code may differ from that address in another place. // The contents of the descriptor (the base address in particular) will // remain unchanged though. 
- uint64_t mapType = op.getMapType(); + mlir::omp::ClauseMapFlags mapType = op.getMapType(); if (isHasDeviceAddrFlag) { - mapType |= llvm::to_underlying( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS); + mapType |= mlir::omp::ClauseMapFlags::always; } mlir::omp::MapInfoOp newDescParentMapOp = mlir::omp::MapInfoOp::create( builder, op->getLoc(), op.getResult().getType(), descriptor, mlir::TypeAttr::get(fir::unwrapRefType(descriptor.getType())), - builder.getIntegerAttr(builder.getIntegerType(64, false), - getDescriptorMapType(mapType, target)), + builder.getAttr( + getDescriptorMapType(mapType, target)), op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{}, newMembers, newMembersAttr, /*bounds=*/mlir::SmallVector{}, /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(), @@ -896,11 +887,9 @@ class MapInfoFinalizationPass builder.create( op->getLoc(), op.getResult().getType(), op.getVarPtr(), op.getVarTypeAttr(), - builder.getIntegerAttr( - builder.getIntegerType(64, false), - llvm::to_underlying( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS)), + builder.getAttr( + mlir::omp::ClauseMapFlags::to | + mlir::omp::ClauseMapFlags::always), op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{}, mlir::SmallVector{}, mlir::ArrayAttr{}, /*bounds=*/mlir::SmallVector{}, @@ -1240,9 +1229,8 @@ class MapInfoFinalizationPass // we need to change this check for early return OR live with // over-mapping. bool hasImplicitMap = - (llvm::omp::OpenMPOffloadMappingFlags(op.getMapType()) & - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT) == - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + (op.getMapType() & mlir::omp::ClauseMapFlags::implicit) == + mlir::omp::ClauseMapFlags::implicit; if (hasImplicitMap) return; diff --git a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp index 30328573b74fc..0972861b8450a 100644 --- a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp +++ b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp @@ -35,7 +35,6 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Pass/Pass.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/Debug.h" #include @@ -70,9 +69,6 @@ class MapsForPrivatizedSymbolsPass return size <= ptrSize && align <= ptrAlign; }; - uint64_t mapTypeTo = static_cast< - std::underlying_type_t>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); Operation *definingOp = var.getDefiningOp(); Value varPtr = var; @@ -122,8 +118,7 @@ class MapsForPrivatizedSymbolsPass builder, loc, varPtr.getType(), varPtr, TypeAttr::get(llvm::cast(varPtr.getType()) .getElementType()), - builder.getIntegerAttr(builder.getIntegerType(64, /*isSigned=*/false), - mapTypeTo), + builder.getAttr(omp::ClauseMapFlags::to), builder.getAttr(captureKind), /*varPtrPtr=*/Value{}, /*members=*/SmallVector{}, diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 4cb0b743107d2..b3fd6c8b4b7c0 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -19,7 +19,6 @@ #include "flang/Parser/parse-tree.h" #include "flang/Semantics/openmp-directive-sets.h" #include "flang/Semantics/semantics.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" using OmpClauseSet = Fortran::common::EnumSet; diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp index 2261912fec22f..15a42c3f50866 100644 
--- a/flang/lib/Utils/OpenMP.cpp +++ b/flang/lib/Utils/OpenMP.cpp @@ -22,8 +22,9 @@ mlir::omp::MapInfoOp createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, mlir::Value varPtrPtr, llvm::StringRef name, llvm::ArrayRef bounds, llvm::ArrayRef members, mlir::ArrayAttr membersIndex, - uint64_t mapType, mlir::omp::VariableCaptureKind mapCaptureType, - mlir::Type retTy, bool partialMap, mlir::FlatSymbolRefAttr mapperId) { + mlir::omp::ClauseMapFlags mapType, + mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, + bool partialMap, mlir::FlatSymbolRefAttr mapperId) { if (auto boxTy = llvm::dyn_cast(baseAddr.getType())) { baseAddr = fir::BoxAddrOp::create(builder, loc, baseAddr); @@ -42,7 +43,7 @@ mlir::omp::MapInfoOp createMapInfoOp(mlir::OpBuilder &builder, mlir::omp::MapInfoOp op = mlir::omp::MapInfoOp::create(builder, loc, retTy, baseAddr, varType, - builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + builder.getAttr(mapType), builder.getAttr(mapCaptureType), varPtrPtr, members, membersIndex, bounds, mapperId, builder.getStringAttr(name), builder.getBoolAttr(partialMap)); @@ -75,8 +76,7 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, firOpBuilder.setInsertionPoint(targetOp); - llvm::omp::OpenMPOffloadMappingFlags mapFlag = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::ClauseMapFlags mapFlag = mlir::omp::ClauseMapFlags::implicit; mlir::omp::VariableCaptureKind captureKind = mlir::omp::VariableCaptureKind::ByRef; @@ -88,16 +88,14 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { captureKind = mlir::omp::VariableCaptureKind::ByCopy; } else if (!fir::isa_builtin_cptr_type(eleType)) { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= mlir::omp::ClauseMapFlags::to; } mlir::Value mapOp = createMapInfoOp(firOpBuilder, copyVal.getLoc(), copyVal, /*varPtrPtr=*/mlir::Value{}, name.str(), bounds, /*members=*/llvm::SmallVector{}, - /*membersIndex=*/mlir::ArrayAttr{}, - static_cast>( - mapFlag), - captureKind, copyVal.getType()); + /*membersIndex=*/mlir::ArrayAttr{}, mapFlag, captureKind, + copyVal.getType()); auto argIface = llvm::cast(*targetOp); mlir::Region ®ion = targetOp.getRegion(); diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 38d51110bbde3..30ed2f0f2f760 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -252,7 +252,7 @@ func.func @_QPomp_target_data() { %c0_6 = arith.constant 0 : index %10 = arith.subi %c1024_1, %c1_5 : index %11 = omp.map.bounds lower_bound(%c0_6 : index) upper_bound(%10 : index) extent(%c1024_1 : index) stride(%c1_5 : index) start_idx(%c1_5 : index) - %12 = omp.map.info var_ptr(%2 : !fir.ref>, !fir.array<1024xi32>) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%11) -> !fir.ref> {name = "c"} + %12 = omp.map.info var_ptr(%2 : !fir.ref>, !fir.array<1024xi32>) map_clauses(always, storage) capture(ByRef) bounds(%11) -> !fir.ref> {name = "c"} omp.target_enter_data map_entries(%6, %9, %12 : !fir.ref>, !fir.ref>, !fir.ref>) %c1_7 = arith.constant 1 : index %c0_8 = arith.constant 0 : index @@ -268,7 +268,7 @@ func.func @_QPomp_target_data() { %c0_12 = arith.constant 0 : index %19 = arith.subi %c1024_1, %c1_11 : index %20 = omp.map.bounds lower_bound(%c0_12 : index) upper_bound(%19 : index) extent(%c1024_1 : index) 
stride(%c1_11 : index) start_idx(%c1_11 : index) - %21 = omp.map.info var_ptr(%2 : !fir.ref>, !fir.array<1024xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%20) -> !fir.ref> {name = "c"} + %21 = omp.map.info var_ptr(%2 : !fir.ref>, !fir.array<1024xi32>) map_clauses(storage) capture(ByRef) bounds(%20) -> !fir.ref> {name = "c"} %c1_13 = arith.constant 1 : index %c0_14 = arith.constant 0 : index %22 = arith.subi %c1024_2, %c1_13 : index @@ -305,7 +305,7 @@ func.func @_QPomp_target_data() { // CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(0 : index) : i64 // CHECK: %[[VAL_24:.*]] = llvm.mlir.constant(1023 : index) : i64 // CHECK: %[[VAL_25:.*]] = omp.map.bounds lower_bound(%[[VAL_23]] : i64) upper_bound(%[[VAL_24]] : i64) extent(%[[VAL_10]] : i64) stride(%[[VAL_22]] : i64) start_idx(%[[VAL_22]] : i64) -// CHECK: %[[VAL_26:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%[[VAL_25]]) -> !llvm.ptr {name = "c"} +// CHECK: %[[VAL_26:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(always, storage) capture(ByRef) bounds(%[[VAL_25]]) -> !llvm.ptr {name = "c"} // CHECK: omp.target_enter_data map_entries(%[[VAL_16]], %[[VAL_21]], %[[VAL_26]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) // CHECK: %[[VAL_27:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK: %[[VAL_28:.*]] = llvm.mlir.constant(0 : index) : i64 @@ -321,7 +321,7 @@ func.func @_QPomp_target_data() { // CHECK: %[[VAL_38:.*]] = llvm.mlir.constant(0 : index) : i64 // CHECK: %[[VAL_39:.*]] = llvm.mlir.constant(1023 : index) : i64 // CHECK: %[[VAL_40:.*]] = omp.map.bounds lower_bound(%[[VAL_38]] : i64) upper_bound(%[[VAL_39]] : i64) extent(%[[VAL_10]] : i64) stride(%[[VAL_37]] : i64) start_idx(%[[VAL_37]] : i64) -// CHECK: %[[VAL_41:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%[[VAL_40]]) -> !llvm.ptr {name = "c"} +// CHECK: %[[VAL_41:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(storage) capture(ByRef) bounds(%[[VAL_40]]) -> !llvm.ptr {name = "c"} // CHECK: %[[VAL_42:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK: %[[VAL_43:.*]] = llvm.mlir.constant(0 : index) : i64 // CHECK: %[[VAL_44:.*]] = llvm.mlir.constant(1023 : index) : i64 diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-private-implicit-scalar-map.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-private-implicit-scalar-map.f90 index 39f9738932d44..126f341a58192 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-private-implicit-scalar-map.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-teams-private-implicit-scalar-map.f90 @@ -23,9 +23,9 @@ program test_default_implicit_firstprivate !CHECK: %[[VAL_4:.*]] = fir.declare %{{.*}} {uniq_name = "_QFEk"} : (!fir.ref) -> !fir.ref !CHECK: %[[VAL_5:.*]] = fir.declare %{{.*}} {uniq_name = "_QFExdgfx"} : (!fir.ref) -> !fir.ref !CHECK: %[[VAL_6:.*]] = fir.declare %{{.*}} {uniq_name = "_QFExfpvx"} : (!fir.ref) -> !fir.ref -!CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "i"} -!CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_3]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "j"} -!CHECK: %[[VAL_9:.*]] = omp.map.info 
var_ptr(%[[VAL_4]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "k"} +!CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "i"} +!CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_3]] : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "j"} +!CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "k"} !CHECK: %[[VAL_10:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> !CHECK: %[[VAL_11:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref>>>, i32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_10]] : !fir.llvm_ptr>>) bounds({{.*}}) -> !fir.llvm_ptr>> {name = ""} !CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref>>>, !fir.box>>) map_clauses(implicit, to) capture(ByRef) members(%[[VAL_11]] : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "allocarr"} diff --git a/flang/test/Lower/OpenMP/common-block-map.f90 b/flang/test/Lower/OpenMP/common-block-map.f90 index a0a1b1fec3e36..7c690c96ddb0b 100644 --- a/flang/test/Lower/OpenMP/common-block-map.f90 +++ b/flang/test/Lower/OpenMP/common-block-map.f90 @@ -36,7 +36,7 @@ subroutine map_full_block !CHECK: %[[CONV:.*]] = fir.convert %[[COORD]] : (!fir.ref) -> !fir.ref !CHECK: %[[CB_MEMBER_2:.*]]:2 = hlfir.declare %[[CONV]] storage(%[[COMMON_BLOCK]][4]) {uniq_name = "_QFmap_mix_of_membersEvar2"} : (!fir.ref, !fir.ref>) -> (!fir.ref, !fir.ref) !CHECK: %[[MAP_EXP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_2]]#1 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "var2"} -!CHECK: %[[MAP_IMP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_1]]#1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "var1"} +!CHECK: %[[MAP_IMP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_1]]#1 : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "var1"} !CHECK: omp.target map_entries(%[[MAP_EXP]] -> %[[ARG_EXP:.*]], %[[MAP_IMP]] -> %[[ARG_IMP:.*]] : !fir.ref, !fir.ref) { !CHECK: %[[EXP_MEMBER:.*]]:2 = hlfir.declare %[[ARG_EXP]] {uniq_name = "_QFmap_mix_of_membersEvar2"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[IMP_MEMBER:.*]]:2 = hlfir.declare %[[ARG_IMP]] {uniq_name = "_QFmap_mix_of_membersEvar1"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/declare-mapper.f90 b/flang/test/Lower/OpenMP/declare-mapper.f90 index 3d4d0da1e18a3..c389d0ff4bd15 100644 --- a/flang/test/Lower/OpenMP/declare-mapper.f90 +++ b/flang/test/Lower/OpenMP/declare-mapper.f90 @@ -80,7 +80,7 @@ subroutine declare_mapper_2 !CHECK: %[[VAL_8:.*]] = omp.map.bounds lower_bound(%[[VAL_6]] : index) upper_bound(%[[VAL_7]] : index) extent(%[[VAL_2]] : index) stride(%[[VAL_5]] : index) start_idx(%[[VAL_5]] : index) !CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !fir.ref>, !fir.array<250xf32>) map_clauses(tofrom) capture(ByRef) bounds(%[[VAL_8]]) -> !fir.ref> {name = "v%[[VAL_10:.*]]"} !CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_1]]#0{"temp"} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref>>}>> - !CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref>>}>>, !fir.type<_QFdeclare_mapper_2Tmy_type{num_vals:i32,values:!fir.box>>}>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref>>}>> {name = "v%[[VAL_13:.*]]"} + !CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref>>}>>, 
!fir.type<_QFdeclare_mapper_2Tmy_type{num_vals:i32,values:!fir.box>>}>) map_clauses(storage) capture(ByRef) -> !fir.ref>>}>> {name = "v%[[VAL_13:.*]]"} !CHECK: %[[VAL_14:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref<[[MY_TYPE]]>, [[MY_TYPE]]) map_clauses(tofrom) capture(ByRef) members(%[[VAL_9]], %[[VAL_12]] : [3], [1] : !fir.ref>, !fir.ref>>}>>) -> !fir.ref<[[MY_TYPE]]> {name = "v", partial_map = true} !CHECK: omp.declare_mapper.info map_entries(%[[VAL_14]], %[[VAL_9]], %[[VAL_12]] : !fir.ref<[[MY_TYPE]]>, !fir.ref>, !fir.ref>>}>>) !CHECK: } diff --git a/flang/test/Lower/OpenMP/defaultmap.f90 b/flang/test/Lower/OpenMP/defaultmap.f90 index 0b26f5db0feb3..b9c902fe43f13 100644 --- a/flang/test/Lower/OpenMP/defaultmap.f90 +++ b/flang/test/Lower/OpenMP/defaultmap.f90 @@ -5,7 +5,7 @@ subroutine defaultmap_allocatable_present() implicit none integer, dimension(:), allocatable :: arr -! CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>>, i32) map_clauses(implicit, present, exit_release_or_enter_alloc) capture(ByRef) var_ptr_ptr({{.*}}) bounds({{.*}}) -> !fir.llvm_ptr>> {name = ""} +! CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>>, i32) map_clauses(implicit, present) capture(ByRef) var_ptr_ptr({{.*}}) bounds({{.*}}) -> !fir.llvm_ptr>> {name = ""} ! CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>>, !fir.box>>) map_clauses(implicit, to) capture(ByRef) members({{.*}}) -> !fir.ref>>> {name = "arr"} !$omp target defaultmap(present: allocatable) arr(1) = 10 @@ -32,7 +32,7 @@ subroutine defaultmap_all_default() integer :: aggregate(16) integer :: scalar_int -! CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "scalar_int"} +! CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "scalar_int"} ! CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>>, i32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr({{.*}}) bounds({{.*}}) -> !fir.llvm_ptr>> {name = ""} ! CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>>, !fir.box>>) map_clauses(implicit, to) capture(ByRef) members({{.*}}) -> !fir.ref>>> {name = "arr"} ! CHECK: %[[MAP_4:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>, !fir.array<16xi32>) map_clauses(implicit, tofrom) capture(ByRef) bounds({{.*}}) -> !fir.ref> {name = "aggregate"} @@ -54,7 +54,7 @@ subroutine defaultmap_pointer_to() ! CHECK-FPRIV: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>>, i32) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr({{.*}}) bounds({{.*}}) -> !fir.llvm_ptr>> {name = ""} ! CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>>, !fir.box>>) map_clauses(implicit, to) capture(ByRef) members({{.*}}) -> !fir.ref>>> {name = "arr_ptr"} ! CHECK-FPRIV: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref, i32) map_clauses(to) capture(ByCopy) -> !fir.ref -! CHECK-NO-FPRIV: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "scalar_int"} +! 
CHECK-NO-FPRIV: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "scalar_int"} !$omp target defaultmap(to: pointer) arr_ptr(1) = scalar_int + 20 !$omp end target diff --git a/flang/test/Lower/OpenMP/has_device_addr-mapinfo.f90 b/flang/test/Lower/OpenMP/has_device_addr-mapinfo.f90 index 8d8c0433758ee..e7bced4c0d29d 100644 --- a/flang/test/Lower/OpenMP/has_device_addr-mapinfo.f90 +++ b/flang/test/Lower/OpenMP/has_device_addr-mapinfo.f90 @@ -17,7 +17,7 @@ integer function s(a) ! Check that the map.info for `a` only takes a single parameter. -!CHECK-DAG: %[[MAP_A:[0-9]+]] = "omp.map.info"(%[[STORAGE_A:[0-9#]+]]) <{map_capture_type = #omp, map_type = 517 : ui64, name = "a", operandSegmentSizes = array, partial_map = false, var_type = !fir.box>}> : (!fir.ref>>) -> !fir.ref> -!CHECK-DAG: %[[MAP_T:[0-9]+]] = "omp.map.info"(%[[STORAGE_T:[0-9#]+]]) <{map_capture_type = #omp, map_type = 2 : ui64, name = "t", operandSegmentSizes = array, partial_map = false, var_type = i32}> : (!fir.ref) -> !fir.ref +!CHECK-DAG: %[[MAP_A:[0-9]+]] = "omp.map.info"(%[[STORAGE_A:[0-9#]+]]) <{map_capture_type = #omp, map_type = #omp, name = "a", operandSegmentSizes = array, partial_map = false, var_type = !fir.box>}> : (!fir.ref>>) -> !fir.ref> +!CHECK-DAG: %[[MAP_T:[0-9]+]] = "omp.map.info"(%[[STORAGE_T:[0-9#]+]]) <{map_capture_type = #omp, map_type = #omp, name = "t", operandSegmentSizes = array, partial_map = false, var_type = i32}> : (!fir.ref) -> !fir.ref !CHECK: "omp.target"(%[[MAP_A]], %[[MAP_T]]) diff --git a/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 index ab2cdf380b783..76dba67df5d07 100644 --- a/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 +++ b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 @@ -11,7 +11,7 @@ !HLFIRDIALECT: %[[B_DECLARE:.*]]:2 = hlfir.declare %[[B_ALLOCA]](%[[B_SHAPE]]) {uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) !HLFIRDIALECT: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}} : index) upper_bound({{.*}} : index) extent({{.*}} : index) stride({{.*}} : index) start_idx({{.*}} : index) {stride_in_bytes = true} !HLFIRDIALECT: %[[MAP_DATA_B:.*]] = omp.map.info var_ptr(%[[B_DECLARE]]#1 : !fir.ref>, f32) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "b"} -!HLFIRDIALECT: %[[MAP_DATA_SZ:.*]] = omp.map.info var_ptr(%[[SZ_DATA]] : !fir.ref, index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} +!HLFIRDIALECT: %[[MAP_DATA_SZ:.*]] = omp.map.info var_ptr(%[[SZ_DATA]] : !fir.ref, index) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = ""} !HLFIRDIALECT: omp.target map_entries(%[[MAP_DATA_B]] -> %[[ARG1:.*]], %[[MAP_DATA_SZ]] -> %[[ARG2:.*]] : !fir.ref>, !fir.ref) { !HLFIRDIALECT: %[[SZ_LD:.*]] = fir.load %[[ARG2]] : !fir.ref !HLFIRDIALECT: %[[SZ_CONV:.*]] = fir.convert %[[SZ_LD]] : (index) -> i64 diff --git a/flang/test/Lower/OpenMP/optional-argument-map-2.f90 b/flang/test/Lower/OpenMP/optional-argument-map-2.f90 index a7744074d3c68..791d509028dee 100644 --- a/flang/test/Lower/OpenMP/optional-argument-map-2.f90 +++ b/flang/test/Lower/OpenMP/optional-argument-map-2.f90 @@ -96,7 +96,7 @@ end module mod ! CHECK-NO-FPRIV: } ! CHECK-NO-FPRIV: %[[VAL_13:.*]] = arith.subi %[[VAL_14:.*]]#0, %[[VAL_10]] : index ! 
CHECK-NO-FPRIV: %[[VAL_15:.*]] = omp.map.bounds lower_bound(%[[VAL_9]] : index) upper_bound(%[[VAL_13]] : index) extent(%[[VAL_14]]#0 : index) stride(%[[VAL_14]]#1 : index) start_idx(%[[VAL_9]] : index) {stride_in_bytes = true} -! CHECK-NO-FPRIV: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_3]]#1 : !fir.ref>, !fir.char<1,?>) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) bounds(%[[VAL_15]]) -> !fir.ref> {name = "a"} +! CHECK-NO-FPRIV: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_3]]#1 : !fir.ref>, !fir.char<1,?>) map_clauses(implicit) capture(ByCopy) bounds(%[[VAL_15]]) -> !fir.ref> {name = "a"} ! CHECK-NO-FPRIV: fir.store %[[ARG0]] to %[[VAL_0]] : !fir.ref> ! CHECK-NO-FPRIV: %[[VAL_17:.*]] = arith.constant 0 : index ! CHECK-NO-FPRIV: %[[VAL_18:.*]] = arith.constant 1 : index diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 1aef64a3b83a8..26bd62edf9d0c 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -69,7 +69,7 @@ subroutine omp_target_enter_mt !CHECK: %[[BOUNDS_1:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr(%{{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref> {name = "b"} !CHECK: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) - !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref> {name = "c"} + !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(always, storage) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref> {name = "c"} !CHECK: %[[BOUNDS_3:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) !CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref> {name = "d"} !CHECK: omp.target_enter_data map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]] : !fir.ref>, !fir.ref>, !fir.ref>, !fir.ref>) @@ -150,9 +150,9 @@ subroutine omp_target_exit_mt !CHECK: %[[BOUNDS_1:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref> {name = "b"} !CHECK: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) - !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref> {name = "c"} + !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(storage) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref> {name = "c"} !CHECK: %[[BOUNDS_3:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) - !CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(always, delete) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref> {name = "d"} + !CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(always, delete, storage) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref> {name = "d"} !CHECK: %[[BOUNDS_4:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) !CHECK: %[[MAP_4:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(from) 
capture(ByRef) bounds(%[[BOUNDS_4]]) -> !fir.ref> {name = "e"} !CHECK: omp.target_exit_data map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]], %[[MAP_4]] : !fir.ref>, !fir.ref>, !fir.ref>, !fir.ref>, !fir.ref>) @@ -482,7 +482,7 @@ subroutine omp_target_implicit_bounds(n) integer :: a(n) !CHECK: %[[VAL_14:.*]] = omp.map.bounds lower_bound(%c0{{.*}} : index) upper_bound(%[[UB]] : index) extent(%[[VAL_7]] : index) stride(%c1{{.*}} : index) start_idx(%c1{{.*}} : index) !CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_10]]#1 : !fir.ref>, i32) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_14]]) -> !fir.ref> {name = "a"} - !CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_COPY]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} + !CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_COPY]] : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = ""} !CHECK: omp.target map_entries(%[[VAL_15]] -> %[[VAL_17:.*]], %[[VAL_16]] -> %[[VAL_18:.*]] : !fir.ref>, !fir.ref) { !$omp target !CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_18]] : !fir.ref @@ -642,8 +642,8 @@ subroutine target_unstructured integer :: i = 1 !CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtarget_unstructuredEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) integer :: j = 11 - !CHECK-NO-FPRIV: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "i"} - !CHECK-NO-FPRIV: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_3]]#1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "j"} + !CHECK-NO-FPRIV: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "i"} + !CHECK-NO-FPRIV: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_3]]#1 : !fir.ref, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref {name = "j"} !CHECK-NO-FPRIV: omp.target map_entries(%[[VAL_4]] -> %[[VAL_6:.*]], %[[VAL_5]] -> %[[VAL_7:.*]] : !fir.ref, !fir.ref) { !CHECK-FPRIV: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]]#0 : !fir.ref, i32) map_clauses(to) capture(ByCopy) -> !fir.ref !CHECK-FPRIV: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_3]]#0 : !fir.ref, i32) map_clauses(to) capture(ByCopy) -> !fir.ref diff --git a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 index 3dca1340ae6b9..40f66c19718e8 100644 --- a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 +++ b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 @@ -30,12 +30,12 @@ end program do_concurrent_shape ! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM0_EXT]] : !fir.ref, index) -! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QFEa.extent.dim0"} ! CHECK: %[[DIM1_EXT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM1_EXT]] : !fir.ref, index) -! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QFEa.extent.dim1"} ! CHECK: omp.target host_eval({{.*}}) map_entries( @@ -79,12 +79,12 @@ end subroutine do_concurrent_shape_shift ! CHECK: %[[DIM0_STRT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM0_STRT]] : !fir.ref, index) -! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! 
CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QF{{.*}}Ea.start_idx.dim0"} ! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM0_EXT]] : !fir.ref, index) -! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = "_QF{{.*}}Ea.extent.dim0"} ! CHECK: omp.target host_eval({{.*}}) map_entries( diff --git a/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 b/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 index b6b2136e2d405..af48eb4852e91 100644 --- a/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 +++ b/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 @@ -24,7 +24,7 @@ end subroutine test_non_refernece ! CHECK: omp.map.info var_ptr(%{{.*}} : !fir.ref, index) ! CHECK: %[[DIM_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref, index) -! CHECK-SAME: map_clauses(implicit, exit_release_or_enter_alloc) +! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref {name = ""} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir index 04e60ca8bbf37..aef72e4fd001e 100644 --- a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir @@ -12,10 +12,10 @@ // CHECK: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} // CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} // CHECK: %[[VAL_6:.*]] = omp.map.info var_ptr(%[[ARG3:.*]] : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} -// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "lb"} -// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "ub"} -// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "step"} -// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "addr"} // CHECK: omp.target_data map_entries(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { // CHECK: %[[VAL_11:.*]] = fir.alloca index // CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir 
b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir index 062eb701b52ef..25f0350ab98b2 100644 --- a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir @@ -12,10 +12,10 @@ // CHECK: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} // CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} // CHECK: %[[VAL_6:.*]] = omp.map.info var_ptr(%[[ARG3:.*]] : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} -// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "lb"} -// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "ub"} -// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "step"} -// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(storage) capture(ByRef) -> !fir.ref {name = "addr"} // CHECK: omp.target_data map_entries(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { // CHECK: %[[VAL_11:.*]] = fir.alloca index // CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir index 7bc0ae4a72b05..b30a2fc4e9a80 100644 --- a/flang/test/Transforms/omp-map-info-finalization.fir +++ b/flang/test/Transforms/omp-map-info-finalization.fir @@ -257,7 +257,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>>) -> !fir.ref>>,vertexy:!fir.box>>}>>>>> %3 = omp.map.bounds lower_bound(%c1_15 : index) upper_bound(%c1_15 : index) extent(%c1_15 : index) stride(%c1_15 : index) start_idx(%c1_15 : index) {stride_in_bytes = true} - %4 = omp.map.info var_ptr(%2 : !fir.ref>>,vertexy:!fir.box>>}>>>>>, !fir.box>>,vertexy:!fir.box>>}>>>>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%3) -> !fir.ref>>,vertexy:!fir.box>>}>>>>> {name = "alloca_dtype%vertexes(2_8)%vertexy"} + %4 = omp.map.info var_ptr(%2 : !fir.ref>>,vertexy:!fir.box>>}>>>>>, !fir.box>>,vertexy:!fir.box>>}>>>>) map_clauses(storage) capture(ByRef) bounds(%3) -> !fir.ref>>,vertexy:!fir.box>>}>>>>> {name = "alloca_dtype%vertexes(2_8)%vertexy"} %5 = fir.load %2 : !fir.ref>>,vertexy:!fir.box>>}>>>>> %c2_i64 = arith.constant 2 : i64 %c1_20 = arith.constant 1 : index @@ -266,7 +266,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : 
!fir.ref>>,vertexy:!fir.box>>}>>>>, index) -> !fir.ref>>,vertexy:!fir.box>>}>> %9 = fir.coordinate_of %8, vertexy : (!fir.ref>>,vertexy:!fir.box>>}>>) -> !fir.ref>>> %10 = omp.map.info var_ptr(%9 : !fir.ref>>>, !fir.box>>) map_clauses(tofrom) capture(ByRef) bounds(%1) -> !fir.ref>>> {name = "alloca_dtype%vertexes(2_8)%vertexy"} - %11 = omp.map.info var_ptr(%0#1 : !fir.ref>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>>, !fir.type<_QFmaptype_nested_derived_type_member_idxTdtype{i:f32,vertexes:!fir.box>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) members(%4, %10 : [1], [1,2] : !fir.ref>>,vertexy:!fir.box>>}>>>>>, !fir.ref>>>) -> !fir.ref>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>> {name = "alloca_dtype", partial_map = true} + %11 = omp.map.info var_ptr(%0#1 : !fir.ref>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>>, !fir.type<_QFmaptype_nested_derived_type_member_idxTdtype{i:f32,vertexes:!fir.box>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>) map_clauses(storage) capture(ByRef) members(%4, %10 : [1], [1,2] : !fir.ref>>,vertexy:!fir.box>>}>>>>>, !fir.ref>>>) -> !fir.ref>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>> {name = "alloca_dtype", partial_map = true} omp.target map_entries(%11 -> %arg1 : !fir.ref>>,vertexy:!fir.box>>}>>>>,array_i:!fir.array<10xi32>}>>) { omp.terminator } @@ -277,7 +277,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>) -> (!fir.ref>, !fir.ref>) // CHECK: %[[DESC_1:.*]] = fir.coordinate_of %[[DECLARE]]#0, vertexes : (!fir.ref>) -> !fir.ref>>,vertexy:!fir.box>>}]]>>>>> // CHECK: %[[BASE_ADDR_1:.*]] = fir.box_offset %[[DESC_1]] base_addr : (!fir.ref>>>>) -> !fir.llvm_ptr>>> -// CHECK: %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.type<[[REC_TY2]]>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr>>>) bounds(%{{.*}}) -> !fir.llvm_ptr>>> {{.*}} +// CHECK: %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.type<[[REC_TY2]]>) map_clauses(storage) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr>>>) bounds(%{{.*}}) -> !fir.llvm_ptr>>> {{.*}} // CHECK: %[[DESC_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref>>>>, !fir.box>>>) map_clauses(to) capture(ByRef) -> !fir.ref>>>> {{.*}} // CHECK: %[[DESC_LD_1:.*]] = fir.load %[[DESC_1]] : !fir.ref>>>> // CHECK: %[[MEMBER_ACCESS_1:.*]] = fir.coordinate_of %[[DESC_LD_1]], %{{.*}} : (!fir.box>>>, index) -> !fir.ref> @@ -285,7 +285,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref>>>) -> !fir.llvm_ptr>> // CHECK: %[[BASE_ADDR_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, i32) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR_2]] : !fir.llvm_ptr>>) bounds(%{{.*}}) -> !fir.llvm_ptr>> {{.*}} // CHECK: %[[DESC_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {{.*}} -// CHECK: %[[TOP_PARENT_MAP:.*]] = omp.map.info var_ptr(%0#1 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) members(%6, %5, %14, %13 : [1], [1, 0], [1, 0, 2], [1, 0, 2, 0] : !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{{.*}} partial_map = true} +// CHECK: %[[TOP_PARENT_MAP:.*]] = omp.map.info var_ptr(%0#1 : !fir.ref>, !fir.type<[[REC_TY]]>) map_clauses(storage) capture(ByRef) members(%6, %5, %14, %13 : [1], [1, 0], [1, 0, 2], [1, 0, 2, 0] : 
!fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref> {{{.*}} partial_map = true} // CHECK: omp.target map_entries(%[[TOP_PARENT_MAP]] -> %{{.*}}, %[[DESC_MAP_1]] -> %{{.*}}, %[[BASE_ADDR_MAP_1]] -> %{{.*}}, %[[DESC_MAP_2]] -> %{{.*}}, %[[BASE_ADDR_MAP_2]] -> %{{.*}} : !fir.ref>, !fir.ref>>>>, !fir.llvm_ptr>>>, !fir.ref>>>, !fir.llvm_ptr>>) { // ----- diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index f693a0737e0fc..d9882cbcb5977 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -101,6 +101,60 @@ def ClauseRequires : OpenMP_BitEnumAttr< def ClauseRequiresAttr : OpenMP_EnumAttr; + +//===----------------------------------------------------------------------===// +// clause_map_flag enum. +//===----------------------------------------------------------------------===// + +def ClauseMapFlagsNone : I32BitEnumAttrCaseNone<"none">; +def ClauseMapFlagsStorage : I32BitEnumAttrCaseBit<"storage", 0>; // alloc/release synonym +def ClauseMapFlagsTo : I32BitEnumAttrCaseBit<"to", 1>; +def ClauseMapFlagsFrom : I32BitEnumAttrCaseBit<"from", 2>; +def ClauseMapFlagsAlways : I32BitEnumAttrCaseBit<"always", 3>; +def ClauseMapFlagsDelete : I32BitEnumAttrCaseBit<"del", 4>; // delete, is reserved by C/C++ +def ClauseMapFlagsReturnParam : I32BitEnumAttrCaseBit<"return_param", 5>; +def ClauseMapFlagsPrivate : I32BitEnumAttrCaseBit<"priv", 6>; // private, is reserved by C/C++ +def ClauseMapFlagsLiteral : I32BitEnumAttrCaseBit<"literal", 7>; +def ClauseMapFlagsImplicit : I32BitEnumAttrCaseBit<"implicit", 8>; +def ClauseMapFlagsClose : I32BitEnumAttrCaseBit<"close", 9>; +def ClauseMapFlagsPresent : I32BitEnumAttrCaseBit<"present", 10>; +def ClauseMapFlagsOMPXHold : I32BitEnumAttrCaseBit<"ompx_hold", 11>; +def ClauseMapFlagsAttach : I32BitEnumAttrCaseBit<"attach", 12>; +def ClauseMapFlagsAttachAlways : I32BitEnumAttrCaseBit<"attach_always", 13>; +def ClauseMapFlagsAttachNone : I32BitEnumAttrCaseBit<"attach_none", 14>; +def ClauseMapFlagsAttachAuto : I32BitEnumAttrCaseBit<"attach_auto", 15>; +def ClauseMapFlagsRefPtr : I32BitEnumAttrCaseBit<"ref_ptr", 16>; +def ClauseMapFlagsRefPtee : I32BitEnumAttrCaseBit<"ref_ptee", 17>; +def ClauseMapFlagsRefPtrPtee : I32BitEnumAttrCaseBit<"ref_ptr_ptee", 18>; + +def ClauseMapFlags : OpenMP_BitEnumAttr< + "ClauseMapFlags", + "Map types and modifiers tied to data maps", [ + ClauseMapFlagsNone, + ClauseMapFlagsStorage, + ClauseMapFlagsTo, + ClauseMapFlagsFrom, + ClauseMapFlagsAlways, + ClauseMapFlagsDelete, + ClauseMapFlagsReturnParam, + ClauseMapFlagsPrivate, + ClauseMapFlagsLiteral, + ClauseMapFlagsImplicit, + ClauseMapFlagsClose, + ClauseMapFlagsPresent, + ClauseMapFlagsOMPXHold, + ClauseMapFlagsAttach, + ClauseMapFlagsAttachAlways, + ClauseMapFlagsAttachNone, + ClauseMapFlagsAttachAuto, + ClauseMapFlagsRefPtr, + ClauseMapFlagsRefPtee, + ClauseMapFlagsRefPtrPtee + ]>; + +def ClauseMapFlagsAttr : OpenMP_EnumAttr; + //===----------------------------------------------------------------------===// // clause_task_depend enum. 
//===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index b73091ea0ca53..377f1febf6b8f 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1198,7 +1198,7 @@ def MapBoundsOp : OpenMP_Op<"map.bounds", def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> { let arguments = (ins OpenMP_PointerLikeType:$var_ptr, TypeAttr:$var_type, - UI64Attr:$map_type, + ClauseMapFlagsAttr:$map_type, VariableCaptureKindAttr:$map_capture_type, Optional:$var_ptr_ptr, Variadic:$members, diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index fd4cabbada96f..1b069c62a8be9 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -32,7 +32,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/ADT/bit.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/InterleavedRange.h" #include #include @@ -1737,10 +1736,10 @@ static LogicalResult verifySynchronizationHint(Operation *op, uint64_t hint) { // Parser, printer and verifier for Target //===----------------------------------------------------------------------===// -// Helper function to get bitwise AND of `value` and 'flag' -static uint64_t mapTypeToBitFlag(uint64_t value, - llvm::omp::OpenMPOffloadMappingFlags flag) { - return value & llvm::to_underlying(flag); +// Helper function to get bitwise AND of `value` and 'flag' then return it as a +// boolean +static bool mapTypeToBool(ClauseMapFlags value, ClauseMapFlags flag) { + return (value & flag) == flag; } /// Parses a map_entries map type from a string format back into its numeric @@ -1748,10 +1747,9 @@ static uint64_t mapTypeToBitFlag(uint64_t value, /// /// map-clause = `map_clauses ( ( `(` `always, `? `implicit, `? `ompx_hold, `? /// `close, `? `present, `? 
( `to` | `from` | `delete` `)` )+ `)` ) -static ParseResult parseMapClause(OpAsmParser &parser, IntegerAttr &mapType) { - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; - +static ParseResult parseMapClause(OpAsmParser &parser, + ClauseMapFlagsAttr &mapType) { + ClauseMapFlags mapTypeBits = ClauseMapFlags::none; // This simply verifies the correct keyword is read in, the // keyword itself is stored inside of the operation auto parseTypeAndMod = [&]() -> ParseResult { @@ -1760,35 +1758,64 @@ static ParseResult parseMapClause(OpAsmParser &parser, IntegerAttr &mapType) { return failure(); if (mapTypeMod == "always") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; + mapTypeBits |= ClauseMapFlags::always; if (mapTypeMod == "implicit") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mapTypeBits |= ClauseMapFlags::implicit; if (mapTypeMod == "ompx_hold") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_OMPX_HOLD; + mapTypeBits |= ClauseMapFlags::ompx_hold; if (mapTypeMod == "close") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE; + mapTypeBits |= ClauseMapFlags::close; if (mapTypeMod == "present") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT; + mapTypeBits |= ClauseMapFlags::present; if (mapTypeMod == "to") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapTypeBits |= ClauseMapFlags::to; if (mapTypeMod == "from") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + mapTypeBits |= ClauseMapFlags::from; if (mapTypeMod == "tofrom") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + mapTypeBits |= ClauseMapFlags::to | ClauseMapFlags::from; if (mapTypeMod == "delete") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; + mapTypeBits |= ClauseMapFlags::del; + + if (mapTypeMod == "storage") + mapTypeBits |= ClauseMapFlags::storage; if (mapTypeMod == "return_param") - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + mapTypeBits |= ClauseMapFlags::return_param; + + if (mapTypeMod == "private") + mapTypeBits |= ClauseMapFlags::priv; + + if (mapTypeMod == "literal") + mapTypeBits |= ClauseMapFlags::literal; + + if (mapTypeMod == "attach") + mapTypeBits |= ClauseMapFlags::attach; + + if (mapTypeMod == "attach_always") + mapTypeBits |= ClauseMapFlags::attach_always; + + if (mapTypeMod == "attach_none") + mapTypeBits |= ClauseMapFlags::attach_none; + + if (mapTypeMod == "attach_auto") + mapTypeBits |= ClauseMapFlags::attach_auto; + + if (mapTypeMod == "ref_ptr") + mapTypeBits |= ClauseMapFlags::ref_ptr; + + if (mapTypeMod == "ref_ptee") + mapTypeBits |= ClauseMapFlags::ref_ptee; + + if (mapTypeMod == "ref_ptr_ptee") + mapTypeBits |= ClauseMapFlags::ref_ptr_ptee; return success(); }; @@ -1796,9 +1823,8 @@ static ParseResult parseMapClause(OpAsmParser &parser, IntegerAttr &mapType) { if (parser.parseCommaSeparatedList(parseTypeAndMod)) return failure(); - mapType = parser.getBuilder().getIntegerAttr( - parser.getBuilder().getIntegerType(64, /*isSigned=*/false), - llvm::to_underlying(mapTypeBits)); + mapType = + parser.getBuilder().getAttr(mapTypeBits); return success(); } @@ -1806,60 +1832,62 @@ static ParseResult parseMapClause(OpAsmParser &parser, IntegerAttr &mapType) { /// Prints a map_entries map type from its numeric value out into its string /// format. 
static void printMapClause(OpAsmPrinter &p, Operation *op, - IntegerAttr mapType) { - uint64_t mapTypeBits = mapType.getUInt(); - - bool emitAllocRelease = true; + ClauseMapFlagsAttr mapType) { llvm::SmallVector mapTypeStrs; + ClauseMapFlags mapFlags = mapType.getValue(); // handling of always, close, present placed at the beginning of the string // to aid readability - if (mapTypeToBitFlag(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS)) + if (mapTypeToBool(mapFlags, ClauseMapFlags::always)) mapTypeStrs.push_back("always"); - if (mapTypeToBitFlag(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)) + if (mapTypeToBool(mapFlags, ClauseMapFlags::implicit)) mapTypeStrs.push_back("implicit"); - if (mapTypeToBitFlag(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_OMPX_HOLD)) + if (mapTypeToBool(mapFlags, ClauseMapFlags::ompx_hold)) mapTypeStrs.push_back("ompx_hold"); - if (mapTypeToBitFlag(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE)) + if (mapTypeToBool(mapFlags, ClauseMapFlags::close)) mapTypeStrs.push_back("close"); - if (mapTypeToBitFlag(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) + if (mapTypeToBool(mapFlags, ClauseMapFlags::present)) mapTypeStrs.push_back("present"); // special handling of to/from/tofrom/delete and release/alloc, release + // alloc are the abscense of one of the other flags, whereas tofrom requires // both the to and from flag to be set. - bool to = mapTypeToBitFlag(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); - bool from = mapTypeToBitFlag( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); - if (to && from) { - emitAllocRelease = false; + bool to = mapTypeToBool(mapFlags, ClauseMapFlags::to); + bool from = mapTypeToBool(mapFlags, ClauseMapFlags::from); + + if (to && from) mapTypeStrs.push_back("tofrom"); - } else if (from) { - emitAllocRelease = false; + else if (from) mapTypeStrs.push_back("from"); - } else if (to) { - emitAllocRelease = false; + else if (to) mapTypeStrs.push_back("to"); - } - if (mapTypeToBitFlag(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE)) { - emitAllocRelease = false; + + if (mapTypeToBool(mapFlags, ClauseMapFlags::del)) mapTypeStrs.push_back("delete"); - } - if (mapTypeToBitFlag( - mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM)) { - emitAllocRelease = false; + if (mapTypeToBool(mapFlags, ClauseMapFlags::return_param)) mapTypeStrs.push_back("return_param"); - } - if (emitAllocRelease) - mapTypeStrs.push_back("exit_release_or_enter_alloc"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::storage)) + mapTypeStrs.push_back("storage"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::priv)) + mapTypeStrs.push_back("private"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::literal)) + mapTypeStrs.push_back("literal"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::attach)) + mapTypeStrs.push_back("attach"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::attach_always)) + mapTypeStrs.push_back("attach_always"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::attach_none)) + mapTypeStrs.push_back("attach_none"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::attach_auto)) + mapTypeStrs.push_back("attach_auto"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::ref_ptr)) + mapTypeStrs.push_back("ref_ptr"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::ref_ptee)) + mapTypeStrs.push_back("ref_ptee"); + if (mapTypeToBool(mapFlags, ClauseMapFlags::ref_ptr_ptee)) + 
mapTypeStrs.push_back("ref_ptr_ptee"); + if (mapFlags == ClauseMapFlags::none) + mapTypeStrs.push_back("none"); for (unsigned int i = 0; i < mapTypeStrs.size(); ++i) { p << mapTypeStrs[i]; @@ -1963,21 +1991,15 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) { return emitError(op->getLoc(), "missing map operation"); if (auto mapInfoOp = mapOp.getDefiningOp()) { - uint64_t mapTypeBits = mapInfoOp.getMapType(); - - bool to = mapTypeToBitFlag( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); - bool from = mapTypeToBitFlag( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); - bool del = mapTypeToBitFlag( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE); - - bool always = mapTypeToBitFlag( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS); - bool close = mapTypeToBitFlag( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE); - bool implicit = mapTypeToBitFlag( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT); + mlir::omp::ClauseMapFlags mapTypeBits = mapInfoOp.getMapType(); + + bool to = mapTypeToBool(mapTypeBits, ClauseMapFlags::to); + bool from = mapTypeToBool(mapTypeBits, ClauseMapFlags::from); + bool del = mapTypeToBool(mapTypeBits, ClauseMapFlags::del); + + bool always = mapTypeToBool(mapTypeBits, ClauseMapFlags::always); + bool close = mapTypeToBool(mapTypeBits, ClauseMapFlags::close); + bool implicit = mapTypeToBool(mapTypeBits, ClauseMapFlags::implicit); if ((isa(op) || isa(op)) && del) return emitError(op->getLoc(), diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8de49dd397d26..b851414ece118 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -3833,6 +3833,58 @@ static llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type, return builder.getInt64(dl.getTypeSizeInBits(type) / 8); } +// Convert the MLIR map flag set to the runtime map flag set for embedding +// in LLVM-IR. This is important as the two bit-flag lists do not correspond +// 1-to-1 as there's flags the runtime doesn't care about and vice versa. +// Certain flags are discarded here such as RefPtee and co. 
+static llvm::omp::OpenMPOffloadMappingFlags +convertClauseMapFlags(omp::ClauseMapFlags mlirFlags) { + auto mapTypeToBool = [&mlirFlags](omp::ClauseMapFlags flag) { + return (mlirFlags & flag) == flag; + }; + + llvm::omp::OpenMPOffloadMappingFlags mapType = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + + if (mapTypeToBool(omp::ClauseMapFlags::to)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + + if (mapTypeToBool(omp::ClauseMapFlags::from)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + + if (mapTypeToBool(omp::ClauseMapFlags::always)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; + + if (mapTypeToBool(omp::ClauseMapFlags::del)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; + + if (mapTypeToBool(omp::ClauseMapFlags::return_param)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + + if (mapTypeToBool(omp::ClauseMapFlags::priv)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRIVATE; + + if (mapTypeToBool(omp::ClauseMapFlags::literal)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_LITERAL; + + if (mapTypeToBool(omp::ClauseMapFlags::implicit)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + + if (mapTypeToBool(omp::ClauseMapFlags::close)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE; + + if (mapTypeToBool(omp::ClauseMapFlags::present)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT; + + if (mapTypeToBool(omp::ClauseMapFlags::ompx_hold)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_OMPX_HOLD; + + if (mapTypeToBool(omp::ClauseMapFlags::attach)) + mapType |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ATTACH; + + return mapType; +} + static void collectMapDataFromMapOperands( MapInfoData &mapData, SmallVectorImpl &mapVars, LLVM::ModuleTranslation &moduleTranslation, DataLayout &dl, @@ -3880,8 +3932,7 @@ static void collectMapDataFromMapOperands( getSizeInBytes(dl, mapOp.getVarType(), mapOp, mapData.Pointers.back(), mapData.BaseType.back(), builder, moduleTranslation)); mapData.MapClause.push_back(mapOp.getOperation()); - mapData.Types.push_back( - llvm::omp::OpenMPOffloadMappingFlags(mapOp.getMapType())); + mapData.Types.push_back(convertClauseMapFlags(mapOp.getMapType())); mapData.Names.push_back(LLVM::createMappingInformation( mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); mapData.DevicePointers.push_back(llvm::OpenMPIRBuilder::DeviceInfoTy::None); @@ -3950,8 +4001,7 @@ static void collectMapDataFromMapOperands( Value offloadPtr = mapOp.getVarPtrPtr() ? mapOp.getVarPtrPtr() : mapOp.getVarPtr(); llvm::Value *origValue = moduleTranslation.lookupValue(offloadPtr); - auto mapType = - static_cast(mapOp.getMapType()); + auto mapType = convertClauseMapFlags(mapOp.getMapType()); auto mapTypeAlways = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; mapData.OriginalValue.push_back(origValue); @@ -4299,8 +4349,7 @@ static void processMapMembersWithParent( // in part as we currently have substantially less information on the data // being mapped at this stage. 
if (checkIfPointerMap(memberClause)) { - auto mapFlag = - llvm::omp::OpenMPOffloadMappingFlags(memberClause.getMapType()); + auto mapFlag = convertClauseMapFlags(memberClause.getMapType()); mapFlag &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM; mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF; ompBuilder.setCorrectMemberOfFlag(mapFlag, memberOfFlag); @@ -4319,8 +4368,7 @@ static void processMapMembersWithParent( // Same MemberOfFlag to indicate its link with parent and other members // of. - auto mapFlag = - llvm::omp::OpenMPOffloadMappingFlags(memberClause.getMapType()); + auto mapFlag = convertClauseMapFlags(memberClause.getMapType()); mapFlag &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM; mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF; ompBuilder.setCorrectMemberOfFlag(mapFlag, memberOfFlag); diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index 7d8ccd910cdf4..f2fbe91a41ecd 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -216,22 +216,22 @@ func.func @task_depend(%arg0: !llvm.ptr) { // CHECK: (%[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: !llvm.ptr, %[[ARG3:.*]]: !llvm.ptr) // CHECK: %[[MAP0:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} -// CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG2]] : !llvm.ptr, i32) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG2]] : !llvm.ptr, i32) map_clauses(always, storage) capture(ByRef) -> !llvm.ptr {name = ""} // CHECK: omp.target_enter_data map_entries(%[[MAP0]], %[[MAP1]], %[[MAP2]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) // CHECK: %[[MAP3:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} // CHECK: %[[MAP4:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} -// CHECK: %[[MAP5:.*]] = omp.map.info var_ptr(%[[ARG2]] : !llvm.ptr, i32) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP5:.*]] = omp.map.info var_ptr(%[[ARG2]] : !llvm.ptr, i32) map_clauses(storage) capture(ByRef) -> !llvm.ptr {name = ""} // CHECK: %[[MAP6:.*]] = omp.map.info var_ptr(%[[ARG3]] : !llvm.ptr, i32) map_clauses(always, delete) capture(ByRef) -> !llvm.ptr {name = ""} // CHECK: omp.target_exit_data map_entries(%[[MAP3]], %[[MAP4]], %[[MAP5]], %[[MAP6]] : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) llvm.func @_QPomp_target_data(%a : !llvm.ptr, %b : !llvm.ptr, %c : !llvm.ptr, %d : !llvm.ptr) { %0 = omp.map.info var_ptr(%a : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} %1 = omp.map.info var_ptr(%b : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} - %2 = omp.map.info var_ptr(%c : !llvm.ptr, i32) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} + %2 = omp.map.info var_ptr(%c : !llvm.ptr, i32) map_clauses(always, storage) capture(ByRef) -> !llvm.ptr {name = ""} omp.target_enter_data map_entries(%0, %1, %2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {} %3 = omp.map.info var_ptr(%a : !llvm.ptr, i32) 
map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} %4 = omp.map.info var_ptr(%b : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} - %5 = omp.map.info var_ptr(%c : !llvm.ptr, i32) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} + %5 = omp.map.info var_ptr(%c : !llvm.ptr, i32) map_clauses(storage) capture(ByRef) -> !llvm.ptr {name = ""} %6 = omp.map.info var_ptr(%d : !llvm.ptr, i32) map_clauses(always, delete) capture(ByRef) -> !llvm.ptr {name = ""} omp.target_exit_data map_entries(%3, %4, %5, %6 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {} llvm.return @@ -266,7 +266,7 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr, %i : !llvm.ptr) { // CHECK: %[[ARG_1:.*]]: !llvm.ptr) { // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(64 : i32) : i32 // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} -// CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG_1]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG_1]] : !llvm.ptr, i32) map_clauses(implicit, storage) capture(ByCopy) -> !llvm.ptr {name = ""} // CHECK: omp.target thread_limit(%[[VAL_0]] : i32) map_entries(%[[MAP1]] -> %[[BB_ARG0:.*]], %[[MAP2]] -> %[[BB_ARG1:.*]] : !llvm.ptr, !llvm.ptr) { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(10 : i32) : i32 // CHECK: llvm.store %[[VAL_1]], %[[BB_ARG1]] : i32, !llvm.ptr @@ -278,7 +278,7 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr, %i : !llvm.ptr) { llvm.func @_QPomp_target(%a : !llvm.ptr, %i : !llvm.ptr) { %0 = llvm.mlir.constant(64 : i32) : i32 %1 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} - %3 = omp.map.info var_ptr(%i : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = ""} + %3 = omp.map.info var_ptr(%i : !llvm.ptr, i32) map_clauses(implicit, storage) capture(ByCopy) -> !llvm.ptr {name = ""} omp.target thread_limit(%0 : i32) map_entries(%1 -> %arg0, %3 -> %arg1 : !llvm.ptr, !llvm.ptr) { %2 = llvm.mlir.constant(10 : i32) : i32 llvm.store %2, %arg1 : i32, !llvm.ptr diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index cbd863f88fd1f..ac29e20907b55 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -828,11 +828,11 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %devic // Test with optional map clause. 
// CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_1:.*]] : memref, tensor) map_clauses(always, to) capture(ByRef) -> memref {name = ""} // CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} - // CHECK: %[[MAP_C:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + // CHECK: %[[MAP_C:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} // CHECK: omp.target is_device_ptr(%[[VAL_4:.*]] : memref) has_device_addr(%[[MAP_A]] -> {{.*}} : memref) map_entries(%[[MAP_B]] -> {{.*}}, %[[MAP_C]] -> {{.*}} : memref, memref) { %mapv0 = omp.map.info var_ptr(%device_addr : memref, tensor) map_clauses(always, to) capture(ByRef) -> memref {name = ""} %mapv1 = omp.map.info var_ptr(%map1 : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} - %mapv2 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + %mapv2 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} omp.target is_device_ptr(%device_ptr : memref) has_device_addr(%mapv0 -> %arg0 : memref) map_entries(%mapv1 -> %arg1, %mapv2 -> %arg2 : memref, memref) { omp.terminator } @@ -868,20 +868,20 @@ func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} - // CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + // CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} // CHECK: omp.target_data map_entries(%[[MAP_A]], %[[MAP_B]] : memref, memref) %mapv3 = omp.map.info var_ptr(%map1 : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} - %mapv4 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + %mapv4 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} omp.target_data map_entries(%mapv3, %mapv4 : memref, memref) {} - // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} // CHECK: omp.target_enter_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref) nowait - %mapv5 = omp.map.info var_ptr(%map1 : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + %mapv5 = omp.map.info var_ptr(%map1 : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} omp.target_enter_data if(%if_cond) device(%device : si32) nowait map_entries(%mapv5 : memref) - // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} // CHECK: omp.target_exit_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref) nowait - 
%mapv6 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + %mapv6 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} omp.target_exit_data if(%if_cond) device(%device : si32) nowait map_entries(%mapv6 : memref) // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref, tensor) map_clauses(ompx_hold, to) capture(ByRef) -> memref {name = ""} @@ -2790,13 +2790,13 @@ func.func @omp_targets_with_map_bounds(%arg0: !llvm.ptr, %arg1: !llvm.ptr) -> () // CHECK: %[[C_12:.*]] = llvm.mlir.constant(2 : index) : i64 // CHECK: %[[C_13:.*]] = llvm.mlir.constant(2 : index) : i64 // CHECK: %[[BOUNDS1:.*]] = omp.map.bounds lower_bound(%[[C_11]] : i64) upper_bound(%[[C_10]] : i64) stride(%[[C_12]] : i64) start_idx(%[[C_13]] : i64) - // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) mapper(@my_mapper) bounds(%[[BOUNDS1]]) -> !llvm.ptr {name = ""} + // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(storage) capture(ByCopy) mapper(@my_mapper) bounds(%[[BOUNDS1]]) -> !llvm.ptr {name = ""} %6 = llvm.mlir.constant(9 : index) : i64 %7 = llvm.mlir.constant(1 : index) : i64 %8 = llvm.mlir.constant(2 : index) : i64 %9 = llvm.mlir.constant(2 : index) : i64 %10 = omp.map.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) stride(%8 : i64) start_idx(%9 : i64) - %mapv2 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) mapper(@my_mapper) bounds(%10) -> !llvm.ptr {name = ""} + %mapv2 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(storage) capture(ByCopy) mapper(@my_mapper) bounds(%10) -> !llvm.ptr {name = ""} // CHECK: omp.target map_entries(%[[MAP0]] -> {{.*}}, %[[MAP1]] -> {{.*}} : !llvm.ptr, !llvm.ptr) omp.target map_entries(%mapv1 -> %arg2, %mapv2 -> %arg3 : !llvm.ptr, !llvm.ptr) { @@ -2806,14 +2806,14 @@ func.func @omp_targets_with_map_bounds(%arg0: !llvm.ptr, %arg1: !llvm.ptr) -> () // CHECK: omp.target_data map_entries(%[[MAP0]], %[[MAP1]] : !llvm.ptr, !llvm.ptr) omp.target_data map_entries(%mapv1, %mapv2 : !llvm.ptr, !llvm.ptr){} - // CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(VLAType) bounds(%[[BOUNDS0]]) -> !llvm.ptr {name = ""} + // CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(storage) capture(VLAType) bounds(%[[BOUNDS0]]) -> !llvm.ptr {name = ""} // CHECK: omp.target_enter_data map_entries(%[[MAP2]] : !llvm.ptr) - %mapv3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(VLAType) bounds(%4) -> !llvm.ptr {name = ""} + %mapv3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(storage) capture(VLAType) bounds(%4) -> !llvm.ptr {name = ""} omp.target_enter_data map_entries(%mapv3 : !llvm.ptr){} - // CHECK: %[[MAP3:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(This) bounds(%[[BOUNDS1]]) -> !llvm.ptr {name = ""} + // CHECK: %[[MAP3:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(storage) capture(This) bounds(%[[BOUNDS1]]) -> !llvm.ptr {name = ""} // CHECK: omp.target_exit_data 
map_entries(%[[MAP3]] : !llvm.ptr) - %mapv4 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(This) bounds(%10) -> !llvm.ptr {name = ""} + %mapv4 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(storage) capture(This) bounds(%10) -> !llvm.ptr {name = ""} omp.target_exit_data map_entries(%mapv4 : !llvm.ptr){} return @@ -2852,7 +2852,7 @@ func.func @omp_target_enter_update_exit_data_depend(%a: memref, %b: memre // CHECK-NEXT: [[MAP2:%.*]] = omp.map.info %map_a = omp.map.info var_ptr(%a: memref, tensor) map_clauses(to) capture(ByRef) -> memref %map_b = omp.map.info var_ptr(%b: memref, tensor) map_clauses(from) capture(ByRef) -> memref - %map_c = omp.map.info var_ptr(%c: memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref + %map_c = omp.map.info var_ptr(%c: memref, tensor) map_clauses(storage) capture(ByRef) -> memref // Do some work on the host that writes to 'a' omp.task depend(taskdependout -> %a : memref) { @@ -3014,7 +3014,7 @@ func.func @parallel_op_reduction_and_private(%priv_var: !llvm.ptr, %priv_var2: ! // CHECK-LABEL: omp_target_private func.func @omp_target_private(%map1: memref, %map2: memref, %priv_var: !llvm.ptr) -> () { %mapv1 = omp.map.info var_ptr(%map1 : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} - %mapv2 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + %mapv2 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} // CHECK: omp.target // CHECK-SAME: private( @@ -3047,7 +3047,7 @@ func.func @omp_target_private(%map1: memref, %map2: memref, %priv_ // CHECK-LABEL: omp_target_private_with_map_idx func.func @omp_target_private_with_map_idx(%map1: memref, %map2: memref, %priv_var: !llvm.ptr) -> () { %mapv1 = omp.map.info var_ptr(%map1 : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} - %mapv2 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + %mapv2 = omp.map.info var_ptr(%map2 : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} // CHECK: omp.target @@ -3321,3 +3321,49 @@ func.func @omp_workdistribute() { } return } + +func.func @omp_target_map_clause_type_test(%arg0 : memref) -> () { + // Test new map clause additions + // CHECK: %{{.*}}map_clauses(none){{.*}} + // CHECK: %{{.*}}map_clauses(to){{.*}} + // CHECK: %{{.*}}map_clauses(from){{.*}} + // CHECK: %{{.*}}map_clauses(tofrom){{.*}} + // CHECK: %{{.*}}map_clauses(storage){{.*}} + // CHECK: %{{.*}}map_clauses(delete){{.*}} + // CHECK: %{{.*}}map_clauses(return_param){{.*}} + // CHECK: %{{.*}}map_clauses(private){{.*}} + // CHECK: %{{.*}}map_clauses(literal){{.*}} + // CHECK: %{{.*}}map_clauses(implicit){{.*}} + // CHECK: %{{.*}}map_clauses(close){{.*}} + // CHECK: %{{.*}}map_clauses(present){{.*}} + // CHECK: %{{.*}}map_clauses(ompx_hold){{.*}} + // CHECK: %{{.*}}map_clauses(attach){{.*}} + // CHECK: %{{.*}}map_clauses(attach_always){{.*}} + // CHECK: %{{.*}}map_clauses(attach_none){{.*}} + // CHECK: %{{.*}}map_clauses(attach_auto){{.*}} + // CHECK: %{{.*}}map_clauses(ref_ptr){{.*}} + // CHECK: %{{.*}}map_clauses(ref_ptee){{.*}} + // CHECK: %{{.*}}map_clauses(ref_ptr_ptee){{.*}} + %mapv0 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(none) capture(ByRef) -> memref {name = ""} + %mapv1 = omp.map.info var_ptr(%arg0 : 
memref, tensor) map_clauses(to) capture(ByRef) -> memref {name = ""} + %mapv2 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(from) capture(ByRef) -> memref {name = ""} + %mapv3 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} + %mapv4 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(storage) capture(ByRef) -> memref {name = ""} + %mapv5 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(delete) capture(ByRef) -> memref {name = ""} + %mapv6 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(return_param) capture(ByRef) -> memref {name = ""} + %mapv7 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(private) capture(ByRef) -> memref {name = ""} + %mapv8 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(literal) capture(ByRef) -> memref {name = ""} + %mapv9 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(implicit) capture(ByRef) -> memref {name = ""} + %mapv10 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(close) capture(ByRef) -> memref {name = ""} + %mapv11 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(present) capture(ByRef) -> memref {name = ""} + %mapv12 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(ompx_hold) capture(ByRef) -> memref {name = ""} + %mapv13 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(attach) capture(ByRef) -> memref {name = ""} + %mapv14 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(attach_always) capture(ByRef) -> memref {name = ""} + %mapv15 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(attach_none) capture(ByRef) -> memref {name = ""} + %mapv16 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(attach_auto) capture(ByRef) -> memref {name = ""} + %mapv17 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(ref_ptr) capture(ByRef) -> memref {name = ""} + %mapv18 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(ref_ptee) capture(ByRef) -> memref {name = ""} + %mapv19 = omp.map.info var_ptr(%arg0 : memref, tensor) map_clauses(ref_ptr_ptee) capture(ByRef) -> memref {name = ""} + + return +} From 9a9fbbba5c81efed72c08a70bbe58427190d69db Mon Sep 17 00:00:00 2001 From: Congzhe Date: Tue, 21 Oct 2025 16:12:26 -0400 Subject: [PATCH 002/530] [InstructionSimplify] Enhance simplifySelectInst() (#163453) Fold select instructions with true and false values that act as the same phi, which cleans up the IR and open up opportunities for other passes such as loop vectorization. --- llvm/lib/Analysis/InstructionSimplify.cpp | 92 ++++++- .../InstCombine/select_with_identical_phi.ll | 243 ++++++++++++++++++ 2 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/InstCombine/select_with_identical_phi.ll diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 8da51d039f197..b573023372d81 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4866,6 +4866,89 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, return nullptr; } +/// Look for the following pattern and simplify %to_fold to %identicalPhi. +/// Here %phi, %to_fold and %phi.next perform the same functionality as +/// %identicalPhi and hence the select instruction %to_fold can be folded +/// into %identicalPhi. +/// +/// BB1: +/// %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ] +/// %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ] +/// ... 
+/// %identicalPhi.next = select %cmp, %val, %identicalPhi +/// (or select %cmp, %identicalPhi, %val) +/// %to_fold = select %cmp2, %identicalPhi, %phi +/// %phi.next = select %cmp, %val, %to_fold +/// (or select %cmp, %to_fold, %val) +/// +/// Prove that %phi and %identicalPhi are the same by induction: +/// +/// Base case: Both %phi and %identicalPhi are equal on entry to the loop. +/// Inductive case: +/// Suppose %phi and %identicalPhi are equal at iteration i. +/// We look at their values at iteration i+1 which are %phi.next and +/// %identicalPhi.next. They would have become different only when %cmp is +/// false and the corresponding values %to_fold and %identicalPhi differ +/// (similar reason for the other "or" case in the bracket). +/// +/// The only condition when %to_fold and %identicalPh could differ is when %cmp2 +/// is false and %to_fold is %phi, which contradicts our inductive hypothesis +/// that %phi and %identicalPhi are equal. Thus %phi and %identicalPhi are +/// always equal at iteration i+1. +bool isSimplifierIdenticalPHI(PHINode &PN, PHINode &IdenticalPN) { + if (PN.getParent() != IdenticalPN.getParent()) + return false; + if (PN.getNumIncomingValues() != 2) + return false; + + // Check that only the backedge incoming value is different. + unsigned DiffVals = 0; + BasicBlock *DiffValBB = nullptr; + for (unsigned i = 0; i < 2; i++) { + BasicBlock *PredBB = PN.getIncomingBlock(i); + if (PN.getIncomingValueForBlock(PredBB) != + IdenticalPN.getIncomingValueForBlock(PredBB)) { + DiffVals++; + DiffValBB = PredBB; + } + } + if (DiffVals != 1) + return false; + // Now check that the backedge incoming values are two select + // instructions with the same condition. Either their true + // values are the same, or their false values are the same. + auto *SI = dyn_cast(PN.getIncomingValueForBlock(DiffValBB)); + auto *IdenticalSI = + dyn_cast(IdenticalPN.getIncomingValueForBlock(DiffValBB)); + if (!SI || !IdenticalSI) + return false; + if (SI->getCondition() != IdenticalSI->getCondition()) + return false; + + SelectInst *SIOtherVal = nullptr; + Value *IdenticalSIOtherVal = nullptr; + if (SI->getTrueValue() == IdenticalSI->getTrueValue()) { + SIOtherVal = dyn_cast(SI->getFalseValue()); + IdenticalSIOtherVal = IdenticalSI->getFalseValue(); + } else if (SI->getFalseValue() == IdenticalSI->getFalseValue()) { + SIOtherVal = dyn_cast(SI->getTrueValue()); + IdenticalSIOtherVal = IdenticalSI->getTrueValue(); + } else { + return false; + } + + // Now check that the other values in select, i.e., %to_fold and + // %identicalPhi, are essentially the same value. + if (!SIOtherVal || IdenticalSIOtherVal != &IdenticalPN) + return false; + if (!(SIOtherVal->getTrueValue() == &IdenticalPN && + SIOtherVal->getFalseValue() == &PN) && + !(SIOtherVal->getTrueValue() == &PN && + SIOtherVal->getFalseValue() == &IdenticalPN)) + return false; + return true; +} + /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, @@ -5041,7 +5124,14 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, std::optional Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL); if (Imp) return *Imp ? TrueVal : FalseVal; - + // Look for same PHIs in the true and false values. 
+ if (auto *TruePHI = dyn_cast(TrueVal)) + if (auto *FalsePHI = dyn_cast(FalseVal)) { + if (isSimplifierIdenticalPHI(*TruePHI, *FalsePHI)) + return FalseVal; + if (isSimplifierIdenticalPHI(*FalsePHI, *TruePHI)) + return TrueVal; + } return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll new file mode 100644 index 0000000000000..7816781250799 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll @@ -0,0 +1,243 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -passes=instcombine | FileCheck %s +@A = extern_weak global float, align 4 + +; %same.as.v1 is a select with two phis %v1 and %phi.to.remove as the true +; and false values, while %v1 and %phi.to.remove are actually the same. +; Fold the selection instruction %same.as.v1 to %v1. +define void @select_with_identical_phi(ptr %m, ptr %n, i32 %count) { +; CHECK-LABEL: @select_with_identical_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 +; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] +; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] +; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] +; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 +; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] + %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] + %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] + %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] + %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] + %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] + %q.load = load float, ptr %q + %c.load = load float, ptr %c + %sub = fsub float %q.load, %c.load + %cmp1 = fcmp olt float %sub, %v0 + %v0.1 = select i1 %cmp1, float %sub, float %v0 + %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove + %cmp2 = fcmp ogt float %sub, %same.as.v1 + %v1.1 = select i1 %cmp2, float %sub, float %v1 + %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 + %inc.i = add nuw nsw i32 %i, 1 + %q.next = getelementptr inbounds i8, ptr %q, i64 4 + %c.next = getelementptr inbounds i8, ptr %c, i64 4 + %exitcond = icmp eq i32 
%inc.i, %count + br i1 %exitcond, label %exit, label %for.body + +exit: + %vl.1.lcssa = phi float [ %v1.1, %for.body ] + store float %vl.1.lcssa, ptr @A + ret void +} + +; The difference from select_with_identical_phi() is that the true and false values in +; %phi.to.remove.next and %v1.1 are swapped. +; Check that %same.as.v1 can be folded. +define void @select_with_identical_phi_2(ptr %m, ptr %n, i32 %count) { +; CHECK-LABEL: @select_with_identical_phi_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 +; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] +; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] +; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] +; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 +; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] + %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] + %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] + %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] + %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] + %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] + %q.load = load float, ptr %q + %c.load = load float, ptr %c + %sub = fsub float %q.load, %c.load + %cmp1 = fcmp olt float %sub, %v0 + %v0.1 = select i1 %cmp1, float %sub, float %v0 + %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove + %cmp2 = fcmp ogt float %sub, %same.as.v1 + %v1.1 = select i1 %cmp2, float %v1, float %sub + %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub + %inc.i = add nuw nsw i32 %i, 1 + %q.next = getelementptr inbounds i8, ptr %q, i64 4 + %c.next = getelementptr inbounds i8, ptr %c, i64 4 + %exitcond = icmp eq i32 %inc.i, %count + br i1 %exitcond, label %exit, label %for.body + +exit: + %vl.1.lcssa = phi float [ %v1.1, %for.body ] + store float %vl.1.lcssa, ptr @A + ret void +} + +; The difference from select_with_identical_phi() is that the true and false values in +; same.as.v1 are swapped. +; Check that %same.as.v1 can be folded. 
+define void @select_with_identical_phi_3(ptr %m, ptr %n, i32 %count) { +; CHECK-LABEL: @select_with_identical_phi_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 +; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] +; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] +; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] +; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 +; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] + %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] + %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] + %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] + %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] + %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] + %q.load = load float, ptr %q + %c.load = load float, ptr %c + %sub = fsub float %q.load, %c.load + %cmp1 = fcmp olt float %sub, %v0 + %v0.1 = select i1 %cmp1, float %sub, float %v0 + %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 + %cmp2 = fcmp ogt float %sub, %same.as.v1 + %v1.1 = select i1 %cmp2, float %sub, float %v1 + %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 + %inc.i = add nuw nsw i32 %i, 1 + %q.next = getelementptr inbounds i8, ptr %q, i64 4 + %c.next = getelementptr inbounds i8, ptr %c, i64 4 + %exitcond = icmp eq i32 %inc.i, %count + br i1 %exitcond, label %exit, label %for.body + +exit: + %vl.1.lcssa = phi float [ %v1.1, %for.body ] + store float %vl.1.lcssa, ptr @A + ret void +} + +; The difference from select_with_identical_phi() is that the true and false values in +; %same.as.v1, %phi.to.remove.next and %v1.1 are swapped. +; Check that %same.as.v1 can be folded. 
+define void @select_with_identical_phi_4(ptr %m, ptr %n, i32 %count) { +; CHECK-LABEL: @select_with_identical_phi_4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 +; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] +; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] +; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] +; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 +; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] + %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] + %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] + %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] + %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] + %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] + %q.load = load float, ptr %q + %c.load = load float, ptr %c + %sub = fsub float %q.load, %c.load + %cmp1 = fcmp olt float %sub, %v0 + %v0.1 = select i1 %cmp1, float %sub, float %v0 + %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 + %cmp2 = fcmp ogt float %sub, %same.as.v1 + %v1.1 = select i1 %cmp2, float %v1, float %sub + %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub + %inc.i = add nuw nsw i32 %i, 1 + %q.next = getelementptr inbounds i8, ptr %q, i64 4 + %c.next = getelementptr inbounds i8, ptr %c, i64 4 + %exitcond = icmp eq i32 %inc.i, %count + br i1 %exitcond, label %exit, label %for.body + +exit: + %vl.1.lcssa = phi float [ %v1.1, %for.body ] + store float %vl.1.lcssa, ptr @A + ret void +} From e1ee75f217766548f26bc644aeb675b6bfad5c62 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 21 Oct 2025 13:16:04 -0700 Subject: [PATCH 003/530] [CIR][NFC] Add infrastructure for handling X86 builtins (#164465) This adds the infrastructure for handling x86-specific builtin calls. It does not add any actual handling, just the code structure and a large switch statement containing all of the builtins that will eventually need to be handled. Target-specific builtins for other targets will be added later. 
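For reviewers, the dispatch path this patch introduces is, roughly (simplified
call chain; the exact code is in the diff below, and all names come from it):

    emitBuiltinExpr()                  // generic handling first
      -> emitTargetBuiltinExpr()       // resolves aux-target builtins, then picks the arch
        -> emitTargetArchBuiltinExpr() // switch over llvm::Triple::ArchType
          -> emitX86BuiltinExpr()      // only x86/x86_64 are routed for now

So a builtin such as __builtin_ia32_undef128 compiled for an x86_64 target now
reaches emitX86BuiltinExpr(), whose large switch currently emits an
"unimplemented X86 builtin call" diagnostic for every listed builtin; all other
architectures simply return a null value and remain NYI.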
--- clang/include/clang/CIR/MissingFeatures.h | 1 + clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 95 +++ clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 814 +++++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenFunction.h | 6 + clang/lib/CIR/CodeGen/CMakeLists.txt | 3 +- 5 files changed, 918 insertions(+), 1 deletion(-) create mode 100644 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 2374c8d1850ac..62125fa306cb8 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -257,6 +257,7 @@ struct MissingFeatures { static bool generateDebugInfo() { return false; } static bool globalViewIndices() { return false; } static bool globalViewIntLowering() { return false; } + static bool handleBuiltinICEArguments() { return false; } static bool hip() { return false; } static bool incrementProfileCounter() { return false; } static bool innermostEHScope() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index ea31871806bd7..798e9d9fbb99e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -463,12 +463,107 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return emitLibraryCall(*this, fd, e, cgm.getBuiltinLibFunction(fd, builtinID)); + // Some target-specific builtins can have aggregate return values, e.g. + // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force + // returnValue to be non-null, so that the target-specific emission code can + // always just emit into it. + cir::TypeEvaluationKind evalKind = getEvaluationKind(e->getType()); + if (evalKind == cir::TEK_Aggregate && returnValue.isNull()) { + cgm.errorNYI(e->getSourceRange(), "aggregate return value from builtin"); + return getUndefRValue(e->getType()); + } + + // Now see if we can emit a target-specific builtin. + if (mlir::Value v = emitTargetBuiltinExpr(builtinID, e, returnValue)) { + switch (evalKind) { + case cir::TEK_Scalar: + if (mlir::isa(v.getType())) + return RValue::get(nullptr); + return RValue::get(v); + case cir::TEK_Aggregate: + cgm.errorNYI(e->getSourceRange(), "aggregate return value from builtin"); + return getUndefRValue(e->getType()); + case cir::TEK_Complex: + llvm_unreachable("No current target builtin returns complex"); + } + llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr"); + } + cgm.errorNYI(e->getSourceRange(), std::string("unimplemented builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return getUndefRValue(e->getType()); } +static mlir::Value emitTargetArchBuiltinExpr(CIRGenFunction *cgf, + unsigned builtinID, + const CallExpr *e, + ReturnValueSlot &returnValue, + llvm::Triple::ArchType arch) { + // When compiling in HipStdPar mode we have to be conservative in rejecting + // target specific features in the FE, and defer the possible error to the + // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is + // referenced by an accelerator executable function, we emit an error. + // Returning nullptr here leads to the builtin being handled in + // EmitStdParUnsupportedBuiltin. 
+ if (cgf->getLangOpts().HIPStdPar && cgf->getLangOpts().CUDAIsDevice && + arch != cgf->getTarget().getTriple().getArch()) + return {}; + + switch (arch) { + case llvm::Triple::arm: + case llvm::Triple::armeb: + case llvm::Triple::thumb: + case llvm::Triple::thumbeb: + case llvm::Triple::aarch64: + case llvm::Triple::aarch64_32: + case llvm::Triple::aarch64_be: + case llvm::Triple::bpfeb: + case llvm::Triple::bpfel: + // These are actually NYI, but that will be reported by emitBuiltinExpr. + // At this point, we don't even know that the builtin is target-specific. + return nullptr; + + case llvm::Triple::x86: + case llvm::Triple::x86_64: + return cgf->emitX86BuiltinExpr(builtinID, e); + + case llvm::Triple::ppc: + case llvm::Triple::ppcle: + case llvm::Triple::ppc64: + case llvm::Triple::ppc64le: + case llvm::Triple::r600: + case llvm::Triple::amdgcn: + case llvm::Triple::systemz: + case llvm::Triple::nvptx: + case llvm::Triple::nvptx64: + case llvm::Triple::wasm32: + case llvm::Triple::wasm64: + case llvm::Triple::hexagon: + case llvm::Triple::riscv32: + case llvm::Triple::riscv64: + // These are actually NYI, but that will be reported by emitBuiltinExpr. + // At this point, we don't even know that the builtin is target-specific. + return {}; + default: + return {}; + } +} + +mlir::Value +CIRGenFunction::emitTargetBuiltinExpr(unsigned builtinID, const CallExpr *e, + ReturnValueSlot &returnValue) { + if (getContext().BuiltinInfo.isAuxBuiltinID(builtinID)) { + assert(getContext().getAuxTargetInfo() && "Missing aux target info"); + return emitTargetArchBuiltinExpr( + this, getContext().BuiltinInfo.getAuxBuiltinID(builtinID), e, + returnValue, getContext().getAuxTargetInfo()->getTriple().getArch()); + } + + return emitTargetArchBuiltinExpr(this, builtinID, e, returnValue, + getTarget().getTriple().getArch()); +} + /// Given a builtin id for a function like "__builtin_fabsf", return a Function* /// for "fabsf". cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd, diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp new file mode 100644 index 0000000000000..3c9c7ecf35aff --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -0,0 +1,814 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit x86/x86_64 Builtin calls as CIR or a function +// call to be later resolved. 
+// +//===----------------------------------------------------------------------===// + +#include "CIRGenFunction.h" +#include "CIRGenModule.h" +#include "clang/Basic/Builtins.h" +#include "clang/Basic/TargetBuiltins.h" +#include "clang/CIR/MissingFeatures.h" +#include "llvm/IR/IntrinsicsX86.h" + +using namespace clang; +using namespace clang::CIRGen; + +mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, + const CallExpr *e) { + if (builtinID == Builtin::BI__builtin_cpu_is) { + cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is"); + return {}; + } + if (builtinID == Builtin::BI__builtin_cpu_supports) { + cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports"); + return {}; + } + if (builtinID == Builtin::BI__builtin_cpu_init) { + cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init"); + return {}; + } + + // Handle MSVC intrinsics before argument evaluation to prevent double + // evaluation. + assert(!cir::MissingFeatures::msvcBuiltins()); + + // Find out if any arguments are required to be integer constant expressions. + assert(!cir::MissingFeatures::handleBuiltinICEArguments()); + + switch (builtinID) { + default: + return {}; + case X86::BI_mm_prefetch: + case X86::BI_mm_clflush: + case X86::BI_mm_lfence: + case X86::BI_mm_pause: + case X86::BI_mm_mfence: + case X86::BI_mm_sfence: + case X86::BI__rdtsc: + case X86::BI__builtin_ia32_rdtscp: + case X86::BI__builtin_ia32_lzcnt_u16: + case X86::BI__builtin_ia32_lzcnt_u32: + case X86::BI__builtin_ia32_lzcnt_u64: + case X86::BI__builtin_ia32_tzcnt_u16: + case X86::BI__builtin_ia32_tzcnt_u32: + case X86::BI__builtin_ia32_tzcnt_u64: + case X86::BI__builtin_ia32_undef128: + case X86::BI__builtin_ia32_undef256: + case X86::BI__builtin_ia32_undef512: + case X86::BI__builtin_ia32_vec_ext_v4hi: + case X86::BI__builtin_ia32_vec_ext_v16qi: + case X86::BI__builtin_ia32_vec_ext_v8hi: + case X86::BI__builtin_ia32_vec_ext_v4si: + case X86::BI__builtin_ia32_vec_ext_v4sf: + case X86::BI__builtin_ia32_vec_ext_v2di: + case X86::BI__builtin_ia32_vec_ext_v32qi: + case X86::BI__builtin_ia32_vec_ext_v16hi: + case X86::BI__builtin_ia32_vec_ext_v8si: + case X86::BI__builtin_ia32_vec_ext_v4di: + case X86::BI__builtin_ia32_vec_set_v4hi: + case X86::BI__builtin_ia32_vec_set_v16qi: + case X86::BI__builtin_ia32_vec_set_v8hi: + case X86::BI__builtin_ia32_vec_set_v4si: + case X86::BI__builtin_ia32_vec_set_v2di: + case X86::BI__builtin_ia32_vec_set_v32qi: + case X86::BI__builtin_ia32_vec_set_v16hi: + case X86::BI__builtin_ia32_vec_set_v8si: + case X86::BI__builtin_ia32_vec_set_v4di: + case X86::BI_mm_setcsr: + case X86::BI__builtin_ia32_ldmxcsr: + case X86::BI_mm_getcsr: + case X86::BI__builtin_ia32_stmxcsr: + case X86::BI__builtin_ia32_xsave: + case X86::BI__builtin_ia32_xsave64: + case X86::BI__builtin_ia32_xrstor: + case X86::BI__builtin_ia32_xrstor64: + case X86::BI__builtin_ia32_xsaveopt: + case X86::BI__builtin_ia32_xsaveopt64: + case X86::BI__builtin_ia32_xrstors: + case X86::BI__builtin_ia32_xrstors64: + case X86::BI__builtin_ia32_xsavec: + case X86::BI__builtin_ia32_xsavec64: + case X86::BI__builtin_ia32_xsaves: + case X86::BI__builtin_ia32_xsaves64: + case X86::BI__builtin_ia32_xsetbv: + case X86::BI_xsetbv: + case X86::BI__builtin_ia32_xgetbv: + case X86::BI_xgetbv: + case X86::BI__builtin_ia32_storedqudi128_mask: + case X86::BI__builtin_ia32_storedqusi128_mask: + case X86::BI__builtin_ia32_storedquhi128_mask: + case X86::BI__builtin_ia32_storedquqi128_mask: + case X86::BI__builtin_ia32_storeupd128_mask: + case 
X86::BI__builtin_ia32_storeups128_mask: + case X86::BI__builtin_ia32_storedqudi256_mask: + case X86::BI__builtin_ia32_storedqusi256_mask: + case X86::BI__builtin_ia32_storedquhi256_mask: + case X86::BI__builtin_ia32_storedquqi256_mask: + case X86::BI__builtin_ia32_storeupd256_mask: + case X86::BI__builtin_ia32_storeups256_mask: + case X86::BI__builtin_ia32_storedqudi512_mask: + case X86::BI__builtin_ia32_storedqusi512_mask: + case X86::BI__builtin_ia32_storedquhi512_mask: + case X86::BI__builtin_ia32_storedquqi512_mask: + case X86::BI__builtin_ia32_storeupd512_mask: + case X86::BI__builtin_ia32_storeups512_mask: + case X86::BI__builtin_ia32_storesbf16128_mask: + case X86::BI__builtin_ia32_storesh128_mask: + case X86::BI__builtin_ia32_storess128_mask: + case X86::BI__builtin_ia32_storesd128_mask: + case X86::BI__builtin_ia32_cvtmask2b128: + case X86::BI__builtin_ia32_cvtmask2b256: + case X86::BI__builtin_ia32_cvtmask2b512: + case X86::BI__builtin_ia32_cvtmask2w128: + case X86::BI__builtin_ia32_cvtmask2w256: + case X86::BI__builtin_ia32_cvtmask2w512: + case X86::BI__builtin_ia32_cvtmask2d128: + case X86::BI__builtin_ia32_cvtmask2d256: + case X86::BI__builtin_ia32_cvtmask2d512: + case X86::BI__builtin_ia32_cvtmask2q128: + case X86::BI__builtin_ia32_cvtmask2q256: + case X86::BI__builtin_ia32_cvtmask2q512: + case X86::BI__builtin_ia32_cvtb2mask128: + case X86::BI__builtin_ia32_cvtb2mask256: + case X86::BI__builtin_ia32_cvtb2mask512: + case X86::BI__builtin_ia32_cvtw2mask128: + case X86::BI__builtin_ia32_cvtw2mask256: + case X86::BI__builtin_ia32_cvtw2mask512: + case X86::BI__builtin_ia32_cvtd2mask128: + case X86::BI__builtin_ia32_cvtd2mask256: + case X86::BI__builtin_ia32_cvtd2mask512: + case X86::BI__builtin_ia32_cvtq2mask128: + case X86::BI__builtin_ia32_cvtq2mask256: + case X86::BI__builtin_ia32_cvtq2mask512: + case X86::BI__builtin_ia32_cvtdq2ps512_mask: + case X86::BI__builtin_ia32_cvtqq2ps512_mask: + case X86::BI__builtin_ia32_cvtqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtw2ph512_mask: + case X86::BI__builtin_ia32_vcvtdq2ph512_mask: + case X86::BI__builtin_ia32_vcvtqq2ph512_mask: + case X86::BI__builtin_ia32_cvtudq2ps512_mask: + case X86::BI__builtin_ia32_cvtuqq2ps512_mask: + case X86::BI__builtin_ia32_cvtuqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtuw2ph512_mask: + case X86::BI__builtin_ia32_vcvtudq2ph512_mask: + case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: + case X86::BI__builtin_ia32_vfmaddss3: + case X86::BI__builtin_ia32_vfmaddsd3: + case X86::BI__builtin_ia32_vfmaddsh3_mask: + case X86::BI__builtin_ia32_vfmaddss3_mask: + case X86::BI__builtin_ia32_vfmaddsd3_mask: + case X86::BI__builtin_ia32_vfmaddss: + case X86::BI__builtin_ia32_vfmaddsd: + case X86::BI__builtin_ia32_vfmaddsh3_maskz: + case X86::BI__builtin_ia32_vfmaddss3_maskz: + case X86::BI__builtin_ia32_vfmaddsd3_maskz: + case X86::BI__builtin_ia32_vfmaddsh3_mask3: + case X86::BI__builtin_ia32_vfmaddss3_mask3: + case X86::BI__builtin_ia32_vfmaddsd3_mask3: + case X86::BI__builtin_ia32_vfmsubsh3_mask3: + case X86::BI__builtin_ia32_vfmsubss3_mask3: + case X86::BI__builtin_ia32_vfmsubsd3_mask3: + case X86::BI__builtin_ia32_vfmaddph512_mask: + case X86::BI__builtin_ia32_vfmaddph512_maskz: + case X86::BI__builtin_ia32_vfmaddph512_mask3: + case X86::BI__builtin_ia32_vfmaddps512_mask: + case X86::BI__builtin_ia32_vfmaddps512_maskz: + case X86::BI__builtin_ia32_vfmaddps512_mask3: + case X86::BI__builtin_ia32_vfmsubps512_mask3: + case X86::BI__builtin_ia32_vfmaddpd512_mask: + case X86::BI__builtin_ia32_vfmaddpd512_maskz: + 
case X86::BI__builtin_ia32_vfmaddpd512_mask3: + case X86::BI__builtin_ia32_vfmsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubph512_mask3: + case X86::BI__builtin_ia32_vfmaddsubph512_mask: + case X86::BI__builtin_ia32_vfmaddsubph512_maskz: + case X86::BI__builtin_ia32_vfmaddsubph512_mask3: + case X86::BI__builtin_ia32_vfmsubaddph512_mask3: + case X86::BI__builtin_ia32_vfmaddsubps512_mask: + case X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case X86::BI__builtin_ia32_vfmaddsubps512_mask3: + case X86::BI__builtin_ia32_vfmsubaddps512_mask3: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask: + case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + case X86::BI__builtin_ia32_movdqa32store128_mask: + case X86::BI__builtin_ia32_movdqa64store128_mask: + case X86::BI__builtin_ia32_storeaps128_mask: + case X86::BI__builtin_ia32_storeapd128_mask: + case X86::BI__builtin_ia32_movdqa32store256_mask: + case X86::BI__builtin_ia32_movdqa64store256_mask: + case X86::BI__builtin_ia32_storeaps256_mask: + case X86::BI__builtin_ia32_storeapd256_mask: + case X86::BI__builtin_ia32_movdqa32store512_mask: + case X86::BI__builtin_ia32_movdqa64store512_mask: + case X86::BI__builtin_ia32_storeaps512_mask: + case X86::BI__builtin_ia32_storeapd512_mask: + case X86::BI__builtin_ia32_loadups128_mask: + case X86::BI__builtin_ia32_loadups256_mask: + case X86::BI__builtin_ia32_loadups512_mask: + case X86::BI__builtin_ia32_loadupd128_mask: + case X86::BI__builtin_ia32_loadupd256_mask: + case X86::BI__builtin_ia32_loadupd512_mask: + case X86::BI__builtin_ia32_loaddquqi128_mask: + case X86::BI__builtin_ia32_loaddquqi256_mask: + case X86::BI__builtin_ia32_loaddquqi512_mask: + case X86::BI__builtin_ia32_loaddquhi128_mask: + case X86::BI__builtin_ia32_loaddquhi256_mask: + case X86::BI__builtin_ia32_loaddquhi512_mask: + case X86::BI__builtin_ia32_loaddqusi128_mask: + case X86::BI__builtin_ia32_loaddqusi256_mask: + case X86::BI__builtin_ia32_loaddqusi512_mask: + case X86::BI__builtin_ia32_loaddqudi128_mask: + case X86::BI__builtin_ia32_loaddqudi256_mask: + case X86::BI__builtin_ia32_loaddqudi512_mask: + case X86::BI__builtin_ia32_loadsbf16128_mask: + case X86::BI__builtin_ia32_loadsh128_mask: + case X86::BI__builtin_ia32_loadss128_mask: + case X86::BI__builtin_ia32_loadsd128_mask: + case X86::BI__builtin_ia32_loadaps128_mask: + case X86::BI__builtin_ia32_loadaps256_mask: + case X86::BI__builtin_ia32_loadaps512_mask: + case X86::BI__builtin_ia32_loadapd128_mask: + case X86::BI__builtin_ia32_loadapd256_mask: + case X86::BI__builtin_ia32_loadapd512_mask: + case X86::BI__builtin_ia32_movdqa32load128_mask: + case X86::BI__builtin_ia32_movdqa32load256_mask: + case X86::BI__builtin_ia32_movdqa32load512_mask: + case X86::BI__builtin_ia32_movdqa64load128_mask: + case X86::BI__builtin_ia32_movdqa64load256_mask: + case X86::BI__builtin_ia32_movdqa64load512_mask: + case X86::BI__builtin_ia32_expandloaddf128_mask: + case X86::BI__builtin_ia32_expandloaddf256_mask: + case X86::BI__builtin_ia32_expandloaddf512_mask: + case X86::BI__builtin_ia32_expandloadsf128_mask: + case X86::BI__builtin_ia32_expandloadsf256_mask: + case X86::BI__builtin_ia32_expandloadsf512_mask: + case X86::BI__builtin_ia32_expandloaddi128_mask: + case X86::BI__builtin_ia32_expandloaddi256_mask: + case X86::BI__builtin_ia32_expandloaddi512_mask: + case X86::BI__builtin_ia32_expandloadsi128_mask: + case X86::BI__builtin_ia32_expandloadsi256_mask: + case 
X86::BI__builtin_ia32_expandloadsi512_mask: + case X86::BI__builtin_ia32_expandloadhi128_mask: + case X86::BI__builtin_ia32_expandloadhi256_mask: + case X86::BI__builtin_ia32_expandloadhi512_mask: + case X86::BI__builtin_ia32_expandloadqi128_mask: + case X86::BI__builtin_ia32_expandloadqi256_mask: + case X86::BI__builtin_ia32_expandloadqi512_mask: + case X86::BI__builtin_ia32_compressstoredf128_mask: + case X86::BI__builtin_ia32_compressstoredf256_mask: + case X86::BI__builtin_ia32_compressstoredf512_mask: + case X86::BI__builtin_ia32_compressstoresf128_mask: + case X86::BI__builtin_ia32_compressstoresf256_mask: + case X86::BI__builtin_ia32_compressstoresf512_mask: + case X86::BI__builtin_ia32_compressstoredi128_mask: + case X86::BI__builtin_ia32_compressstoredi256_mask: + case X86::BI__builtin_ia32_compressstoredi512_mask: + case X86::BI__builtin_ia32_compressstoresi128_mask: + case X86::BI__builtin_ia32_compressstoresi256_mask: + case X86::BI__builtin_ia32_compressstoresi512_mask: + case X86::BI__builtin_ia32_compressstorehi128_mask: + case X86::BI__builtin_ia32_compressstorehi256_mask: + case X86::BI__builtin_ia32_compressstorehi512_mask: + case X86::BI__builtin_ia32_compressstoreqi128_mask: + case X86::BI__builtin_ia32_compressstoreqi256_mask: + case X86::BI__builtin_ia32_compressstoreqi512_mask: + case X86::BI__builtin_ia32_expanddf128_mask: + case X86::BI__builtin_ia32_expanddf256_mask: + case X86::BI__builtin_ia32_expanddf512_mask: + case X86::BI__builtin_ia32_expandsf128_mask: + case X86::BI__builtin_ia32_expandsf256_mask: + case X86::BI__builtin_ia32_expandsf512_mask: + case X86::BI__builtin_ia32_expanddi128_mask: + case X86::BI__builtin_ia32_expanddi256_mask: + case X86::BI__builtin_ia32_expanddi512_mask: + case X86::BI__builtin_ia32_expandsi128_mask: + case X86::BI__builtin_ia32_expandsi256_mask: + case X86::BI__builtin_ia32_expandsi512_mask: + case X86::BI__builtin_ia32_expandhi128_mask: + case X86::BI__builtin_ia32_expandhi256_mask: + case X86::BI__builtin_ia32_expandhi512_mask: + case X86::BI__builtin_ia32_expandqi128_mask: + case X86::BI__builtin_ia32_expandqi256_mask: + case X86::BI__builtin_ia32_expandqi512_mask: + case X86::BI__builtin_ia32_compressdf128_mask: + case X86::BI__builtin_ia32_compressdf256_mask: + case X86::BI__builtin_ia32_compressdf512_mask: + case X86::BI__builtin_ia32_compresssf128_mask: + case X86::BI__builtin_ia32_compresssf256_mask: + case X86::BI__builtin_ia32_compresssf512_mask: + case X86::BI__builtin_ia32_compressdi128_mask: + case X86::BI__builtin_ia32_compressdi256_mask: + case X86::BI__builtin_ia32_compressdi512_mask: + case X86::BI__builtin_ia32_compresssi128_mask: + case X86::BI__builtin_ia32_compresssi256_mask: + case X86::BI__builtin_ia32_compresssi512_mask: + case X86::BI__builtin_ia32_compresshi128_mask: + case X86::BI__builtin_ia32_compresshi256_mask: + case X86::BI__builtin_ia32_compresshi512_mask: + case X86::BI__builtin_ia32_compressqi128_mask: + case X86::BI__builtin_ia32_compressqi256_mask: + case X86::BI__builtin_ia32_compressqi512_mask: + case X86::BI__builtin_ia32_gather3div2df: + case X86::BI__builtin_ia32_gather3div2di: + case X86::BI__builtin_ia32_gather3div4df: + case X86::BI__builtin_ia32_gather3div4di: + case X86::BI__builtin_ia32_gather3div4sf: + case X86::BI__builtin_ia32_gather3div4si: + case X86::BI__builtin_ia32_gather3div8sf: + case X86::BI__builtin_ia32_gather3div8si: + case X86::BI__builtin_ia32_gather3siv2df: + case X86::BI__builtin_ia32_gather3siv2di: + case X86::BI__builtin_ia32_gather3siv4df: + case 
X86::BI__builtin_ia32_gather3siv4di: + case X86::BI__builtin_ia32_gather3siv4sf: + case X86::BI__builtin_ia32_gather3siv4si: + case X86::BI__builtin_ia32_gather3siv8sf: + case X86::BI__builtin_ia32_gather3siv8si: + case X86::BI__builtin_ia32_gathersiv8df: + case X86::BI__builtin_ia32_gathersiv16sf: + case X86::BI__builtin_ia32_gatherdiv8df: + case X86::BI__builtin_ia32_gatherdiv16sf: + case X86::BI__builtin_ia32_gathersiv8di: + case X86::BI__builtin_ia32_gathersiv16si: + case X86::BI__builtin_ia32_gatherdiv8di: + case X86::BI__builtin_ia32_gatherdiv16si: + case X86::BI__builtin_ia32_scattersiv8df: + case X86::BI__builtin_ia32_scattersiv16sf: + case X86::BI__builtin_ia32_scatterdiv8df: + case X86::BI__builtin_ia32_scatterdiv16sf: + case X86::BI__builtin_ia32_scattersiv8di: + case X86::BI__builtin_ia32_scattersiv16si: + case X86::BI__builtin_ia32_scatterdiv8di: + case X86::BI__builtin_ia32_scatterdiv16si: + case X86::BI__builtin_ia32_scatterdiv2df: + case X86::BI__builtin_ia32_scatterdiv2di: + case X86::BI__builtin_ia32_scatterdiv4df: + case X86::BI__builtin_ia32_scatterdiv4di: + case X86::BI__builtin_ia32_scatterdiv4sf: + case X86::BI__builtin_ia32_scatterdiv4si: + case X86::BI__builtin_ia32_scatterdiv8sf: + case X86::BI__builtin_ia32_scatterdiv8si: + case X86::BI__builtin_ia32_scattersiv2df: + case X86::BI__builtin_ia32_scattersiv2di: + case X86::BI__builtin_ia32_scattersiv4df: + case X86::BI__builtin_ia32_scattersiv4di: + case X86::BI__builtin_ia32_scattersiv4sf: + case X86::BI__builtin_ia32_scattersiv4si: + case X86::BI__builtin_ia32_scattersiv8sf: + case X86::BI__builtin_ia32_scattersiv8si: + case X86::BI__builtin_ia32_vextractf128_pd256: + case X86::BI__builtin_ia32_vextractf128_ps256: + case X86::BI__builtin_ia32_vextractf128_si256: + case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_extractf64x4_mask: + case X86::BI__builtin_ia32_extractf32x4_mask: + case X86::BI__builtin_ia32_extracti64x4_mask: + case X86::BI__builtin_ia32_extracti32x4_mask: + case X86::BI__builtin_ia32_extractf32x8_mask: + case X86::BI__builtin_ia32_extracti32x8_mask: + case X86::BI__builtin_ia32_extractf32x4_256_mask: + case X86::BI__builtin_ia32_extracti32x4_256_mask: + case X86::BI__builtin_ia32_extractf64x2_256_mask: + case X86::BI__builtin_ia32_extracti64x2_256_mask: + case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extracti64x2_512_mask: + case X86::BI__builtin_ia32_vinsertf128_pd256: + case X86::BI__builtin_ia32_vinsertf128_ps256: + case X86::BI__builtin_ia32_vinsertf128_si256: + case X86::BI__builtin_ia32_insert128i256: + case X86::BI__builtin_ia32_insertf64x4: + case X86::BI__builtin_ia32_insertf32x4: + case X86::BI__builtin_ia32_inserti64x4: + case X86::BI__builtin_ia32_inserti32x4: + case X86::BI__builtin_ia32_insertf32x8: + case X86::BI__builtin_ia32_inserti32x8: + case X86::BI__builtin_ia32_insertf32x4_256: + case X86::BI__builtin_ia32_inserti32x4_256: + case X86::BI__builtin_ia32_insertf64x2_256: + case X86::BI__builtin_ia32_inserti64x2_256: + case X86::BI__builtin_ia32_insertf64x2_512: + case X86::BI__builtin_ia32_inserti64x2_512: + case X86::BI__builtin_ia32_pmovqd512_mask: + case X86::BI__builtin_ia32_pmovwb512_mask: + case X86::BI__builtin_ia32_pblendw128: + case X86::BI__builtin_ia32_blendpd: + case X86::BI__builtin_ia32_blendps: + case X86::BI__builtin_ia32_blendpd256: + case X86::BI__builtin_ia32_blendps256: + case X86::BI__builtin_ia32_pblendw256: + case X86::BI__builtin_ia32_pblendd128: + case X86::BI__builtin_ia32_pblendd256: + case 
X86::BI__builtin_ia32_pshuflw: + case X86::BI__builtin_ia32_pshuflw256: + case X86::BI__builtin_ia32_pshuflw512: + case X86::BI__builtin_ia32_pshufhw: + case X86::BI__builtin_ia32_pshufhw256: + case X86::BI__builtin_ia32_pshufhw512: + case X86::BI__builtin_ia32_pshufd: + case X86::BI__builtin_ia32_pshufd256: + case X86::BI__builtin_ia32_pshufd512: + case X86::BI__builtin_ia32_vpermilpd: + case X86::BI__builtin_ia32_vpermilps: + case X86::BI__builtin_ia32_vpermilpd256: + case X86::BI__builtin_ia32_vpermilps256: + case X86::BI__builtin_ia32_vpermilpd512: + case X86::BI__builtin_ia32_vpermilps512: + case X86::BI__builtin_ia32_shufpd: + case X86::BI__builtin_ia32_shufpd256: + case X86::BI__builtin_ia32_shufpd512: + case X86::BI__builtin_ia32_shufps: + case X86::BI__builtin_ia32_shufps256: + case X86::BI__builtin_ia32_shufps512: + case X86::BI__builtin_ia32_permdi256: + case X86::BI__builtin_ia32_permdf256: + case X86::BI__builtin_ia32_permdi512: + case X86::BI__builtin_ia32_permdf512: + case X86::BI__builtin_ia32_palignr128: + case X86::BI__builtin_ia32_palignr256: + case X86::BI__builtin_ia32_palignr512: + case X86::BI__builtin_ia32_alignd128: + case X86::BI__builtin_ia32_alignd256: + case X86::BI__builtin_ia32_alignd512: + case X86::BI__builtin_ia32_alignq128: + case X86::BI__builtin_ia32_alignq256: + case X86::BI__builtin_ia32_alignq512: + case X86::BI__builtin_ia32_shuf_f32x4_256: + case X86::BI__builtin_ia32_shuf_f64x2_256: + case X86::BI__builtin_ia32_shuf_i32x4_256: + case X86::BI__builtin_ia32_shuf_i64x2_256: + case X86::BI__builtin_ia32_shuf_f32x4: + case X86::BI__builtin_ia32_shuf_f64x2: + case X86::BI__builtin_ia32_shuf_i32x4: + case X86::BI__builtin_ia32_shuf_i64x2: + case X86::BI__builtin_ia32_vperm2f128_pd256: + case X86::BI__builtin_ia32_vperm2f128_ps256: + case X86::BI__builtin_ia32_vperm2f128_si256: + case X86::BI__builtin_ia32_permti256: + case X86::BI__builtin_ia32_pslldqi128_byteshift: + case X86::BI__builtin_ia32_pslldqi256_byteshift: + case X86::BI__builtin_ia32_pslldqi512_byteshift: + case X86::BI__builtin_ia32_psrldqi128_byteshift: + case X86::BI__builtin_ia32_psrldqi256_byteshift: + case X86::BI__builtin_ia32_psrldqi512_byteshift: + case X86::BI__builtin_ia32_kshiftliqi: + case X86::BI__builtin_ia32_kshiftlihi: + case X86::BI__builtin_ia32_kshiftlisi: + case X86::BI__builtin_ia32_kshiftlidi: + case X86::BI__builtin_ia32_kshiftriqi: + case X86::BI__builtin_ia32_kshiftrihi: + case X86::BI__builtin_ia32_kshiftrisi: + case X86::BI__builtin_ia32_kshiftridi: + case X86::BI__builtin_ia32_vprotbi: + case X86::BI__builtin_ia32_vprotwi: + case X86::BI__builtin_ia32_vprotdi: + case X86::BI__builtin_ia32_vprotqi: + case X86::BI__builtin_ia32_prold128: + case X86::BI__builtin_ia32_prold256: + case X86::BI__builtin_ia32_prold512: + case X86::BI__builtin_ia32_prolq128: + case X86::BI__builtin_ia32_prolq256: + case X86::BI__builtin_ia32_prolq512: + case X86::BI__builtin_ia32_prord128: + case X86::BI__builtin_ia32_prord256: + case X86::BI__builtin_ia32_prord512: + case X86::BI__builtin_ia32_prorq128: + case X86::BI__builtin_ia32_prorq256: + case X86::BI__builtin_ia32_prorq512: + case X86::BI__builtin_ia32_selectb_128: + case X86::BI__builtin_ia32_selectb_256: + case X86::BI__builtin_ia32_selectb_512: + case X86::BI__builtin_ia32_selectw_128: + case X86::BI__builtin_ia32_selectw_256: + case X86::BI__builtin_ia32_selectw_512: + case X86::BI__builtin_ia32_selectd_128: + case X86::BI__builtin_ia32_selectd_256: + case X86::BI__builtin_ia32_selectd_512: + case 
X86::BI__builtin_ia32_selectq_128: + case X86::BI__builtin_ia32_selectq_256: + case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectph_128: + case X86::BI__builtin_ia32_selectph_256: + case X86::BI__builtin_ia32_selectph_512: + case X86::BI__builtin_ia32_selectpbf_128: + case X86::BI__builtin_ia32_selectpbf_256: + case X86::BI__builtin_ia32_selectpbf_512: + case X86::BI__builtin_ia32_selectps_128: + case X86::BI__builtin_ia32_selectps_256: + case X86::BI__builtin_ia32_selectps_512: + case X86::BI__builtin_ia32_selectpd_128: + case X86::BI__builtin_ia32_selectpd_256: + case X86::BI__builtin_ia32_selectpd_512: + case X86::BI__builtin_ia32_selectsh_128: + case X86::BI__builtin_ia32_selectsbf_128: + case X86::BI__builtin_ia32_selectss_128: + case X86::BI__builtin_ia32_selectsd_128: + case X86::BI__builtin_ia32_cmpb128_mask: + case X86::BI__builtin_ia32_cmpb256_mask: + case X86::BI__builtin_ia32_cmpb512_mask: + case X86::BI__builtin_ia32_cmpw128_mask: + case X86::BI__builtin_ia32_cmpw256_mask: + case X86::BI__builtin_ia32_cmpw512_mask: + case X86::BI__builtin_ia32_cmpd128_mask: + case X86::BI__builtin_ia32_cmpd256_mask: + case X86::BI__builtin_ia32_cmpd512_mask: + case X86::BI__builtin_ia32_cmpq128_mask: + case X86::BI__builtin_ia32_cmpq256_mask: + case X86::BI__builtin_ia32_cmpq512_mask: + case X86::BI__builtin_ia32_ucmpb128_mask: + case X86::BI__builtin_ia32_ucmpb256_mask: + case X86::BI__builtin_ia32_ucmpb512_mask: + case X86::BI__builtin_ia32_ucmpw128_mask: + case X86::BI__builtin_ia32_ucmpw256_mask: + case X86::BI__builtin_ia32_ucmpw512_mask: + case X86::BI__builtin_ia32_ucmpd128_mask: + case X86::BI__builtin_ia32_ucmpd256_mask: + case X86::BI__builtin_ia32_ucmpd512_mask: + case X86::BI__builtin_ia32_ucmpq128_mask: + case X86::BI__builtin_ia32_ucmpq256_mask: + case X86::BI__builtin_ia32_ucmpq512_mask: + case X86::BI__builtin_ia32_vpcomb: + case X86::BI__builtin_ia32_vpcomw: + case X86::BI__builtin_ia32_vpcomd: + case X86::BI__builtin_ia32_vpcomq: + case X86::BI__builtin_ia32_vpcomub: + case X86::BI__builtin_ia32_vpcomuw: + case X86::BI__builtin_ia32_vpcomud: + case X86::BI__builtin_ia32_vpcomuq: + case X86::BI__builtin_ia32_kortestcqi: + case X86::BI__builtin_ia32_kortestchi: + case X86::BI__builtin_ia32_kortestcsi: + case X86::BI__builtin_ia32_kortestcdi: + case X86::BI__builtin_ia32_kortestzqi: + case X86::BI__builtin_ia32_kortestzhi: + case X86::BI__builtin_ia32_kortestzsi: + case X86::BI__builtin_ia32_kortestzdi: + case X86::BI__builtin_ia32_ktestcqi: + case X86::BI__builtin_ia32_ktestzqi: + case X86::BI__builtin_ia32_ktestchi: + case X86::BI__builtin_ia32_ktestzhi: + case X86::BI__builtin_ia32_ktestcsi: + case X86::BI__builtin_ia32_ktestzsi: + case X86::BI__builtin_ia32_ktestcdi: + case X86::BI__builtin_ia32_ktestzdi: + case X86::BI__builtin_ia32_kaddqi: + case X86::BI__builtin_ia32_kaddhi: + case X86::BI__builtin_ia32_kaddsi: + case X86::BI__builtin_ia32_kadddi: + case X86::BI__builtin_ia32_kandqi: + case X86::BI__builtin_ia32_kandhi: + case X86::BI__builtin_ia32_kandsi: + case X86::BI__builtin_ia32_kanddi: + case X86::BI__builtin_ia32_kandnqi: + case X86::BI__builtin_ia32_kandnhi: + case X86::BI__builtin_ia32_kandnsi: + case X86::BI__builtin_ia32_kandndi: + case X86::BI__builtin_ia32_korqi: + case X86::BI__builtin_ia32_korhi: + case X86::BI__builtin_ia32_korsi: + case X86::BI__builtin_ia32_kordi: + case X86::BI__builtin_ia32_kxnorqi: + case X86::BI__builtin_ia32_kxnorhi: + case X86::BI__builtin_ia32_kxnorsi: + case X86::BI__builtin_ia32_kxnordi: + case 
X86::BI__builtin_ia32_kxorqi: + case X86::BI__builtin_ia32_kxorhi: + case X86::BI__builtin_ia32_kxorsi: + case X86::BI__builtin_ia32_kxordi: + case X86::BI__builtin_ia32_knotqi: + case X86::BI__builtin_ia32_knothi: + case X86::BI__builtin_ia32_knotsi: + case X86::BI__builtin_ia32_knotdi: + case X86::BI__builtin_ia32_kmovb: + case X86::BI__builtin_ia32_kmovw: + case X86::BI__builtin_ia32_kmovd: + case X86::BI__builtin_ia32_kmovq: + case X86::BI__builtin_ia32_kunpckdi: + case X86::BI__builtin_ia32_kunpcksi: + case X86::BI__builtin_ia32_kunpckhi: + case X86::BI__builtin_ia32_sqrtsh_round_mask: + case X86::BI__builtin_ia32_sqrtsd_round_mask: + case X86::BI__builtin_ia32_sqrtss_round_mask: + case X86::BI__builtin_ia32_sqrtpd256: + case X86::BI__builtin_ia32_sqrtpd: + case X86::BI__builtin_ia32_sqrtps256: + case X86::BI__builtin_ia32_sqrtps: + case X86::BI__builtin_ia32_sqrtph256: + case X86::BI__builtin_ia32_sqrtph: + case X86::BI__builtin_ia32_sqrtph512: + case X86::BI__builtin_ia32_vsqrtbf16256: + case X86::BI__builtin_ia32_vsqrtbf16: + case X86::BI__builtin_ia32_vsqrtbf16512: + case X86::BI__builtin_ia32_sqrtps512: + case X86::BI__builtin_ia32_sqrtpd512: + case X86::BI__builtin_ia32_pmuludq128: + case X86::BI__builtin_ia32_pmuludq256: + case X86::BI__builtin_ia32_pmuludq512: + case X86::BI__builtin_ia32_pmuldq128: + case X86::BI__builtin_ia32_pmuldq256: + case X86::BI__builtin_ia32_pmuldq512: + case X86::BI__builtin_ia32_pternlogd512_mask: + case X86::BI__builtin_ia32_pternlogq512_mask: + case X86::BI__builtin_ia32_pternlogd128_mask: + case X86::BI__builtin_ia32_pternlogd256_mask: + case X86::BI__builtin_ia32_pternlogq128_mask: + case X86::BI__builtin_ia32_pternlogq256_mask: + case X86::BI__builtin_ia32_pternlogd512_maskz: + case X86::BI__builtin_ia32_pternlogq512_maskz: + case X86::BI__builtin_ia32_pternlogd128_maskz: + case X86::BI__builtin_ia32_pternlogd256_maskz: + case X86::BI__builtin_ia32_pternlogq128_maskz: + case X86::BI__builtin_ia32_pternlogq256_maskz: + case X86::BI__builtin_ia32_vpshldd128: + case X86::BI__builtin_ia32_vpshldd256: + case X86::BI__builtin_ia32_vpshldd512: + case X86::BI__builtin_ia32_vpshldq128: + case X86::BI__builtin_ia32_vpshldq256: + case X86::BI__builtin_ia32_vpshldq512: + case X86::BI__builtin_ia32_vpshldw128: + case X86::BI__builtin_ia32_vpshldw256: + case X86::BI__builtin_ia32_vpshldw512: + case X86::BI__builtin_ia32_vpshrdd128: + case X86::BI__builtin_ia32_vpshrdd256: + case X86::BI__builtin_ia32_vpshrdd512: + case X86::BI__builtin_ia32_vpshrdq128: + case X86::BI__builtin_ia32_vpshrdq256: + case X86::BI__builtin_ia32_vpshrdq512: + case X86::BI__builtin_ia32_vpshrdw128: + case X86::BI__builtin_ia32_vpshrdw256: + case X86::BI__builtin_ia32_vpshrdw512: + case X86::BI__builtin_ia32_reduce_fadd_pd512: + case X86::BI__builtin_ia32_reduce_fadd_ps512: + case X86::BI__builtin_ia32_reduce_fadd_ph512: + case X86::BI__builtin_ia32_reduce_fadd_ph256: + case X86::BI__builtin_ia32_reduce_fadd_ph128: + case X86::BI__builtin_ia32_reduce_fmul_pd512: + case X86::BI__builtin_ia32_reduce_fmul_ps512: + case X86::BI__builtin_ia32_reduce_fmul_ph512: + case X86::BI__builtin_ia32_reduce_fmul_ph256: + case X86::BI__builtin_ia32_reduce_fmul_ph128: + case X86::BI__builtin_ia32_reduce_fmax_pd512: + case X86::BI__builtin_ia32_reduce_fmax_ps512: + case X86::BI__builtin_ia32_reduce_fmax_ph512: + case X86::BI__builtin_ia32_reduce_fmax_ph256: + case X86::BI__builtin_ia32_reduce_fmax_ph128: + case X86::BI__builtin_ia32_reduce_fmin_pd512: + case X86::BI__builtin_ia32_reduce_fmin_ps512: + 
case X86::BI__builtin_ia32_reduce_fmin_ph512: + case X86::BI__builtin_ia32_reduce_fmin_ph256: + case X86::BI__builtin_ia32_reduce_fmin_ph128: + case X86::BI__builtin_ia32_rdrand16_step: + case X86::BI__builtin_ia32_rdrand32_step: + case X86::BI__builtin_ia32_rdrand64_step: + case X86::BI__builtin_ia32_rdseed16_step: + case X86::BI__builtin_ia32_rdseed32_step: + case X86::BI__builtin_ia32_rdseed64_step: + case X86::BI__builtin_ia32_addcarryx_u32: + case X86::BI__builtin_ia32_addcarryx_u64: + case X86::BI__builtin_ia32_subborrow_u32: + case X86::BI__builtin_ia32_subborrow_u64: + case X86::BI__builtin_ia32_fpclassps128_mask: + case X86::BI__builtin_ia32_fpclassps256_mask: + case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_vfpclassbf16128_mask: + case X86::BI__builtin_ia32_vfpclassbf16256_mask: + case X86::BI__builtin_ia32_vfpclassbf16512_mask: + case X86::BI__builtin_ia32_fpclassph128_mask: + case X86::BI__builtin_ia32_fpclassph256_mask: + case X86::BI__builtin_ia32_fpclassph512_mask: + case X86::BI__builtin_ia32_fpclasspd128_mask: + case X86::BI__builtin_ia32_fpclasspd256_mask: + case X86::BI__builtin_ia32_fpclasspd512_mask: + case X86::BI__builtin_ia32_vp2intersect_q_512: + case X86::BI__builtin_ia32_vp2intersect_q_256: + case X86::BI__builtin_ia32_vp2intersect_q_128: + case X86::BI__builtin_ia32_vp2intersect_d_512: + case X86::BI__builtin_ia32_vp2intersect_d_256: + case X86::BI__builtin_ia32_vp2intersect_d_128: + case X86::BI__builtin_ia32_vpmultishiftqb128: + case X86::BI__builtin_ia32_vpmultishiftqb256: + case X86::BI__builtin_ia32_vpmultishiftqb512: + case X86::BI__builtin_ia32_vpshufbitqmb128_mask: + case X86::BI__builtin_ia32_vpshufbitqmb256_mask: + case X86::BI__builtin_ia32_vpshufbitqmb512_mask: + case X86::BI__builtin_ia32_cmpeqps: + case X86::BI__builtin_ia32_cmpeqpd: + case X86::BI__builtin_ia32_cmpltps: + case X86::BI__builtin_ia32_cmpltpd: + case X86::BI__builtin_ia32_cmpleps: + case X86::BI__builtin_ia32_cmplepd: + case X86::BI__builtin_ia32_cmpunordps: + case X86::BI__builtin_ia32_cmpunordpd: + case X86::BI__builtin_ia32_cmpneqps: + case X86::BI__builtin_ia32_cmpneqpd: + case X86::BI__builtin_ia32_cmpnltps: + case X86::BI__builtin_ia32_cmpnltpd: + case X86::BI__builtin_ia32_cmpnleps: + case X86::BI__builtin_ia32_cmpnlepd: + case X86::BI__builtin_ia32_cmpordps: + case X86::BI__builtin_ia32_cmpordpd: + case X86::BI__builtin_ia32_cmpph128_mask: + case X86::BI__builtin_ia32_cmpph256_mask: + case X86::BI__builtin_ia32_cmpph512_mask: + case X86::BI__builtin_ia32_cmpps128_mask: + case X86::BI__builtin_ia32_cmpps256_mask: + case X86::BI__builtin_ia32_cmpps512_mask: + case X86::BI__builtin_ia32_cmppd128_mask: + case X86::BI__builtin_ia32_cmppd256_mask: + case X86::BI__builtin_ia32_cmppd512_mask: + case X86::BI__builtin_ia32_vcmpbf16512_mask: + case X86::BI__builtin_ia32_vcmpbf16256_mask: + case X86::BI__builtin_ia32_vcmpbf16128_mask: + case X86::BI__builtin_ia32_cmpps: + case X86::BI__builtin_ia32_cmpps256: + case X86::BI__builtin_ia32_cmppd: + case X86::BI__builtin_ia32_cmppd256: + case X86::BI__builtin_ia32_cmpeqss: + case X86::BI__builtin_ia32_cmpltss: + case X86::BI__builtin_ia32_cmpless: + case X86::BI__builtin_ia32_cmpunordss: + case X86::BI__builtin_ia32_cmpneqss: + case X86::BI__builtin_ia32_cmpnltss: + case X86::BI__builtin_ia32_cmpnless: + case X86::BI__builtin_ia32_cmpordss: + case X86::BI__builtin_ia32_cmpeqsd: + case X86::BI__builtin_ia32_cmpltsd: + case X86::BI__builtin_ia32_cmplesd: + case X86::BI__builtin_ia32_cmpunordsd: + case 
X86::BI__builtin_ia32_cmpneqsd: + case X86::BI__builtin_ia32_cmpnltsd: + case X86::BI__builtin_ia32_cmpnlesd: + case X86::BI__builtin_ia32_cmpordsd: + case X86::BI__builtin_ia32_vcvtph2ps_mask: + case X86::BI__builtin_ia32_vcvtph2ps256_mask: + case X86::BI__builtin_ia32_vcvtph2ps512_mask: + case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: + case X86::BI__builtin_ia32_cvtsbf162ss_32: + case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: + case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: + case X86::BI__cpuid: + case X86::BI__cpuidex: + case X86::BI__emul: + case X86::BI__emulu: + case X86::BI__mulh: + case X86::BI__umulh: + case X86::BI_mul128: + case X86::BI_umul128: + case X86::BI__faststorefence: + case X86::BI__shiftleft128: + case X86::BI__shiftright128: + case X86::BI_ReadWriteBarrier: + case X86::BI_ReadBarrier: + case X86::BI_WriteBarrier: + case X86::BI_AddressOfReturnAddress: + case X86::BI__stosb: + case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: + case X86::BI__ud2: + case X86::BI__int2c: + case X86::BI__readfsbyte: + case X86::BI__readfsword: + case X86::BI__readfsdword: + case X86::BI__readfsqword: + case X86::BI__readgsbyte: + case X86::BI__readgsword: + case X86::BI__readgsdword: + case X86::BI__readgsqword: + case X86::BI__builtin_ia32_encodekey128_u32: + case X86::BI__builtin_ia32_encodekey256_u32: + case X86::BI__builtin_ia32_aesenc128kl_u8: + case X86::BI__builtin_ia32_aesdec128kl_u8: + case X86::BI__builtin_ia32_aesenc256kl_u8: + case X86::BI__builtin_ia32_aesdec256kl_u8: + case X86::BI__builtin_ia32_aesencwide128kl_u8: + case X86::BI__builtin_ia32_aesdecwide128kl_u8: + case X86::BI__builtin_ia32_aesencwide256kl_u8: + case X86::BI__builtin_ia32_aesdecwide256kl_u8: + case X86::BI__builtin_ia32_vfcmaddcph512_mask: + case X86::BI__builtin_ia32_vfmaddcph512_mask: + case X86::BI__builtin_ia32_vfcmaddcsh_round_mask: + case X86::BI__builtin_ia32_vfmaddcsh_round_mask: + case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3: + case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: + case X86::BI__builtin_ia32_prefetchi: + cgm.errorNYI(e->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } +} diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index fa3260c5ec5b8..706801f95bb6e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1614,6 +1614,10 @@ class CIRGenFunction : public CIRGenTypeCache { bool buildingTopLevelCase); mlir::LogicalResult emitSwitchStmt(const clang::SwitchStmt &s); + mlir::Value emitTargetBuiltinExpr(unsigned builtinID, + const clang::CallExpr *e, + ReturnValueSlot &returnValue); + /// Given a value and its clang type, returns the value casted to its memory /// representation. 
/// Note: CIR defers most of the special casting to the final lowering passes @@ -1652,6 +1656,8 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s); + mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e); + /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is /// nonnull, if 1\p LHS is marked _Nonnull. void emitNullabilityCheck(LValue lhs, mlir::Value rhs, diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index 36db4bd9bab25..7c31beacc5fb3 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -11,13 +11,14 @@ add_clang_library(clangCIR CIRGenAsm.cpp CIRGenAtomic.cpp CIRGenBuilder.cpp + CIRGenBuiltin.cpp + CIRGenBuiltinX86.cpp CIRGenCall.cpp CIRGenClass.cpp CIRGenCleanup.cpp CIRGenCoroutine.cpp CIRGenCXX.cpp CIRGenCXXABI.cpp - CIRGenBuiltin.cpp CIRGenDecl.cpp CIRGenDeclCXX.cpp CIRGenDeclOpenACC.cpp From e960f1ec47c3e0060e3031432293c8a1c99381cb Mon Sep 17 00:00:00 2001 From: Max Desiatov Date: Tue, 21 Oct 2025 21:26:28 +0100 Subject: [PATCH 004/530] Refine `qWasmCallStack` documentation (#163803) `qWasmCallStack` documentation is not clear enough in how the list of PC values is delimited and the size of these values. --- lldb/docs/resources/lldbgdbremote.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lldb/docs/resources/lldbgdbremote.md b/lldb/docs/resources/lldbgdbremote.md index 287484ea5cabf..032edb66690e4 100644 --- a/lldb/docs/resources/lldbgdbremote.md +++ b/lldb/docs/resources/lldbgdbremote.md @@ -2491,9 +2491,10 @@ The packet below are supported by the ### qWasmCallStack Get the Wasm call stack for the given thread id. This returns a hex-encoded -list of PC values, one for each frame of the call stack. To match the Wasm -specification, the addresses are encoded in little endian byte order, even if -the endian of the Wasm runtime's host is not little endian. +list (with no delimiters) of 64-bit PC values, one for each frame of the call +stack. To match the Wasm specification, the addresses are encoded in little +endian byte order, even if the endian of the Wasm runtime's host is not little +endian. ``` send packet: $qWasmCallStack:202dbe040#08 From 116b94ce7900b79d5c4b84456da26d8e40dbae3b Mon Sep 17 00:00:00 2001 From: jiang1997 Date: Wed, 22 Oct 2025 04:27:54 +0800 Subject: [PATCH 005/530] [CIR] Add array new cookie support (#163649) This patch adds the minimal support for array cookies needed to enable ClangIR generation for an array new expression that requires cookies but does not require an explicit initializer. This only provides the cookie support for the base Itanium CXXABI. Different cookie calculations are required for AppleARM64, which will be added in a subsequent patch. Ported from ClangIR incubator PR https://github.com/llvm/clangir/pull/1297. This is the second PR in a series intended to address https://github.com/llvm/llvm-project/issues/160383. 
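For reference, the cookie arithmetic implemented by the new Itanium ABI hooks
works out as follows for the `new C[3]` case in the added test, assuming a
64-bit target where sizeof(size_t) == 8 (C is the test's empty class with a
user-declared destructor, so sizeof(C) == 1 and its preferred alignment is 1):

    cookieSize   = max(sizeof(size_t), preferred alignment of C) = max(8, 1) = 8
    allocSize    = 3 * sizeof(C) + cookieSize                    = 3 + 8     = 11
    cookieOffset = cookieSize - sizeof(size_t)                   = 0

operator new[](11) is called, the element count 3 is stored in the first 8
bytes of the allocation, and the returned pointer is advanced by cookieSize (8)
bytes to reach the first array element. This is what the CHECK lines below
verify: the `#cir.int<11>` allocation size, the store of the count 3 through
the cookie pointer, and the pointer stride of 8 to the data pointer.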
--------- Co-authored-by: Andy Kaylor --- clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp | 3 +- clang/lib/CIR/CodeGen/CIRGenCXXABI.h | 22 +++- clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 73 +++++++++-- clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 75 ++++++++++- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenTypeCache.h | 10 +- clang/test/CIR/CodeGen/new.cpp | 121 ++++++++++++++++++ 7 files changed, 290 insertions(+), 16 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp index d81492ef01540..eef3739196d94 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp @@ -85,8 +85,7 @@ CharUnits CIRGenCXXABI::getArrayCookieSize(const CXXNewExpr *e) { if (!requiresArrayCookie(e)) return CharUnits::Zero(); - cgm.errorNYI(e->getSourceRange(), "CIRGenCXXABI::getArrayCookieSize"); - return CharUnits::Zero(); + return getArrayCookieSizeImpl(e->getAllocatedType()); } bool CIRGenCXXABI::requiresArrayCookie(const CXXNewExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h index da7cc72136ef1..c78f9b0ea8a26 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h +++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h @@ -302,8 +302,28 @@ class CIRGenCXXABI { /// - non-array allocations never need a cookie /// - calls to \::operator new(size_t, void*) never need a cookie /// - /// \param E - the new-expression being allocated. + /// \param e - the new-expression being allocated. virtual CharUnits getArrayCookieSize(const CXXNewExpr *e); + + /// Initialize the array cookie for the given allocation. + /// + /// \param newPtr - a char* which is the presumed-non-null + /// return value of the allocation function + /// \param numElements - the computed number of elements, + /// potentially collapsed from the multidimensional array case; + /// always a size_t + /// \param elementType - the base element allocated type, + /// i.e. the allocated type after stripping all array types + virtual Address initializeArrayCookie(CIRGenFunction &cgf, Address newPtr, + mlir::Value numElements, + const CXXNewExpr *e, + QualType elementType) = 0; + +protected: + /// Returns the extra size required in order to store the array + /// cookie for the given type. Assumes that an array cookie is + /// required. + virtual CharUnits getArrayCookieSizeImpl(QualType elementType) = 0; }; /// Creates and Itanium-family ABI diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index b1e9e768ff1e2..fe9e21064263b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -306,6 +306,7 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, mlir::cast(constNumElements).getValue(); unsigned numElementsWidth = count.getBitWidth(); + bool hasAnyOverflow = false; // The equivalent code in CodeGen/CGExprCXX.cpp handles these cases as // overflow, but that should never happen. The size argument is implicitly @@ -336,11 +337,22 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, // Add in the cookie, and check whether it's overflowed. if (cookieSize != 0) { - cgf.cgm.errorNYI(e->getSourceRange(), - "emitCXXNewAllocSize: array cookie"); + // Save the current size without a cookie. 
This shouldn't be + // used if there was overflow + sizeWithoutCookie = cgf.getBuilder().getConstInt( + loc, allocationSize.zextOrTrunc(sizeWidth)); + + allocationSize = allocationSize.uadd_ov(cookieSize, overflow); + hasAnyOverflow |= overflow; } - size = cgf.getBuilder().getConstInt(loc, allocationSize); + // On overflow, produce a -1 so operator new will fail + if (hasAnyOverflow) { + size = + cgf.getBuilder().getConstInt(loc, llvm::APInt::getAllOnes(sizeWidth)); + } else { + size = cgf.getBuilder().getConstInt(loc, allocationSize); + } } else { // TODO: Handle the variable size case cgf.cgm.errorNYI(e->getSourceRange(), @@ -390,7 +402,50 @@ void CIRGenFunction::emitNewArrayInitializer( if (!e->hasInitializer()) return; - cgm.errorNYI(e->getSourceRange(), "emitNewArrayInitializer"); + unsigned initListElements = 0; + + const Expr *init = e->getInitializer(); + const InitListExpr *ile = dyn_cast(init); + if (ile) { + cgm.errorNYI(ile->getSourceRange(), "emitNewArrayInitializer: init list"); + return; + } + + // If all elements have already been initialized, skip any further + // initialization. + auto constOp = mlir::dyn_cast(numElements.getDefiningOp()); + if (constOp) { + auto constIntAttr = mlir::dyn_cast(constOp.getValue()); + // Just skip out if the constant count is zero. + if (constIntAttr && constIntAttr.getUInt() <= initListElements) + return; + } + + assert(init && "have trailing elements to initialize but no initializer"); + + // If this is a constructor call, try to optimize it out, and failing that + // emit a single loop to initialize all remaining elements. + if (const CXXConstructExpr *cce = dyn_cast(init)) { + CXXConstructorDecl *ctor = cce->getConstructor(); + if (ctor->isTrivial()) { + // If new expression did not specify value-initialization, then there + // is no initialization. + if (!cce->requiresZeroInitialization()) + return; + + cgm.errorNYI(cce->getSourceRange(), + "emitNewArrayInitializer: trivial ctor zero-init"); + return; + } + + cgm.errorNYI(cce->getSourceRange(), + "emitNewArrayInitializer: ctor initializer"); + return; + } + + cgm.errorNYI(init->getSourceRange(), + "emitNewArrayInitializer: unsupported initializer"); + return; } static void emitNewInitializer(CIRGenFunction &cgf, const CXXNewExpr *e, @@ -586,9 +641,6 @@ mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) { // If there is a brace-initializer, cannot allocate fewer elements than inits. 
unsigned minElements = 0; - if (e->isArray() && e->hasInitializer()) { - cgm.errorNYI(e->getSourceRange(), "emitCXXNewExpr: array initializer"); - } mlir::Value numElements = nullptr; mlir::Value allocSizeWithoutCookie = nullptr; @@ -667,8 +719,11 @@ mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) { !e->getOperatorDelete()->isReservedGlobalPlacementOperator()) cgm.errorNYI(e->getSourceRange(), "emitCXXNewExpr: operator delete"); - if (allocSize != allocSizeWithoutCookie) - cgm.errorNYI(e->getSourceRange(), "emitCXXNewExpr: array with cookies"); + if (allocSize != allocSizeWithoutCookie) { + assert(e->isArray()); + allocation = cgm.getCXXABI().initializeArrayCookie( + *this, allocation, numElements, e, allocType); + } mlir::Type elementTy; if (e->isArray()) { diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index c184d4a4b1d97..e620310437219 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -135,8 +135,14 @@ class CIRGenItaniumCXXABI : public CIRGenCXXABI { cir::PointerType destCIRTy, bool isRefCast, Address src) override; - /**************************** RTTI Uniqueness ******************************/ + Address initializeArrayCookie(CIRGenFunction &cgf, Address newPtr, + mlir::Value numElements, const CXXNewExpr *e, + QualType elementType) override; + protected: + CharUnits getArrayCookieSizeImpl(QualType elementType) override; + + /**************************** RTTI Uniqueness ******************************/ /// Returns true if the ABI requires RTTI type_info objects to be unique /// across a program. virtual bool shouldRTTIBeUnique() const { return true; } @@ -2003,3 +2009,70 @@ mlir::Value CIRGenItaniumCXXABI::emitDynamicCast(CIRGenFunction &cgf, return cgf.getBuilder().createDynCast(loc, src.getPointer(), destCIRTy, isRefCast, castInfo); } + +/************************** Array allocation cookies **************************/ + +CharUnits CIRGenItaniumCXXABI::getArrayCookieSizeImpl(QualType elementType) { + // The array cookie is a size_t; pad that up to the element alignment. + // The cookie is actually right-justified in that space. + return std::max( + cgm.getSizeSize(), + cgm.getASTContext().getPreferredTypeAlignInChars(elementType)); +} + +Address CIRGenItaniumCXXABI::initializeArrayCookie(CIRGenFunction &cgf, + Address newPtr, + mlir::Value numElements, + const CXXNewExpr *e, + QualType elementType) { + assert(requiresArrayCookie(e)); + + // TODO: When sanitizer support is implemented, we'll need to + // get the address space from `newPtr`. + assert(!cir::MissingFeatures::addressSpace()); + assert(!cir::MissingFeatures::sanitizers()); + + ASTContext &ctx = cgm.getASTContext(); + CharUnits sizeSize = cgf.getSizeSize(); + mlir::Location loc = cgf.getLoc(e->getSourceRange()); + + // The size of the cookie. + CharUnits cookieSize = + std::max(sizeSize, ctx.getPreferredTypeAlignInChars(elementType)); + assert(cookieSize == getArrayCookieSizeImpl(elementType)); + + cir::PointerType u8PtrTy = cgf.getBuilder().getUInt8PtrTy(); + mlir::Value baseBytePtr = + cgf.getBuilder().createPtrBitcast(newPtr.getPointer(), u8PtrTy); + + // Compute an offset to the cookie. 
+ CharUnits cookieOffset = cookieSize - sizeSize; + mlir::Value cookiePtrValue = baseBytePtr; + if (!cookieOffset.isZero()) { + mlir::Value offsetOp = cgf.getBuilder().getSignedInt( + loc, cookieOffset.getQuantity(), /*width=*/32); + cookiePtrValue = + cgf.getBuilder().createPtrStride(loc, cookiePtrValue, offsetOp); + } + + CharUnits baseAlignment = newPtr.getAlignment(); + CharUnits cookiePtrAlignment = baseAlignment.alignmentAtOffset(cookieOffset); + Address cookiePtr(cookiePtrValue, u8PtrTy, cookiePtrAlignment); + + // Write the number of elements into the appropriate slot. + Address numElementsPtr = + cookiePtr.withElementType(cgf.getBuilder(), cgf.SizeTy); + cgf.getBuilder().createStore(loc, numElements, numElementsPtr); + + // Finally, compute a pointer to the actual data buffer by skipping + // over the cookie completely. + mlir::Value dataOffset = + cgf.getBuilder().getSignedInt(loc, cookieSize.getQuantity(), + /*width=*/32); + mlir::Value dataPtr = + cgf.getBuilder().createPtrStride(loc, baseBytePtr, dataOffset); + mlir::Value finalPtr = + cgf.getBuilder().createPtrBitcast(dataPtr, newPtr.getElementType()); + CharUnits finalAlignment = baseAlignment.alignmentAtOffset(cookieSize); + return Address(finalPtr, newPtr.getElementType(), finalAlignment); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 74d39b73fff8f..6b293738fdf8a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -102,7 +102,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, // TODO(CIR): Should be updated once TypeSizeInfoAttr is upstreamed const unsigned sizeTypeSize = astContext.getTypeSize(astContext.getSignedSizeType()); - SizeAlignInBytes = astContext.toCharUnitsFromBits(sizeTypeSize).getQuantity(); + SizeSizeInBytes = astContext.toCharUnitsFromBits(sizeTypeSize).getQuantity(); // In CIRGenTypeCache, UIntPtrTy and SizeType are fields of the same union UIntPtrTy = cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/false); diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h index b5612d9127506..ff5842cd86e04 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h @@ -74,11 +74,17 @@ struct CIRGenTypeCache { unsigned char PointerSizeInBytes; }; - /// The alignment of size_t. - unsigned char SizeAlignInBytes; + /// The size and alignment of size_t. 
+ union { + unsigned char SizeSizeInBytes; // sizeof(size_t) + unsigned char SizeAlignInBytes; + }; cir::TargetAddressSpaceAttr cirAllocaAddressSpace; + clang::CharUnits getSizeSize() const { + return clang::CharUnits::fromQuantity(SizeSizeInBytes); + } clang::CharUnits getSizeAlign() const { return clang::CharUnits::fromQuantity(SizeAlignInBytes); } diff --git a/clang/test/CIR/CodeGen/new.cpp b/clang/test/CIR/CodeGen/new.cpp index 000ea5b77bebf..2efad10c19efd 100644 --- a/clang/test/CIR/CodeGen/new.cpp +++ b/clang/test/CIR/CodeGen/new.cpp @@ -208,6 +208,127 @@ void t_new_constant_size() { // OGCG: %[[CALL:.*]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 128) // OGCG: store ptr %[[CALL]], ptr %[[P_ADDR]], align 8 +class C { + public: + ~C(); +}; + +void t_constant_size_nontrivial() { + auto p = new C[3]; +} + +// CHECK: cir.func{{.*}} @_Z26t_constant_size_nontrivialv() +// CHECK: %[[P_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} +// CHECK: %[[#NUM_ELEMENTS:]] = cir.const #cir.int<3> : !u64i +// CHECK: %[[#SIZE_WITHOUT_COOKIE:]] = cir.const #cir.int<3> : !u64i +// CHECK: %[[#ALLOCATION_SIZE:]] = cir.const #cir.int<11> : !u64i +// CHECK: %[[RAW_PTR:.*]] = cir.call @_Znam(%[[#ALLOCATION_SIZE]]) : (!u64i) -> !cir.ptr +// CHECK: %[[COOKIE_PTR_BASE:.*]] = cir.cast bitcast %[[RAW_PTR]] : !cir.ptr -> !cir.ptr> +// CHECK: %[[COOKIE_PTR:.*]] = cir.cast bitcast %[[COOKIE_PTR_BASE]] : !cir.ptr> -> !cir.ptr +// CHECK: cir.store align(8) %[[#NUM_ELEMENTS]], %[[COOKIE_PTR]] : !u64i, !cir.ptr +// CHECK: %[[#COOKIE_SIZE:]] = cir.const #cir.int<8> : !s32i +// CHECK: %[[DATA_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[#COOKIE_SIZE]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[DATA_PTR_VOID:.*]] = cir.cast bitcast %[[DATA_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: %[[DATA_PTR:.*]] = cir.cast bitcast %[[DATA_PTR_VOID]] : !cir.ptr -> !cir.ptr +// CHECK: cir.store align(8) %[[DATA_PTR]], %[[P_ADDR]] : !cir.ptr, !cir.ptr> +// CHECK: cir.return +// CHECK: } + +// LLVM: @_Z26t_constant_size_nontrivialv() +// LLVM: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[COOKIE_PTR:.*]] = call ptr @_Znam(i64 11) +// LLVM: store i64 3, ptr %[[COOKIE_PTR]], align 8 +// LLVM: %[[ALLOCATED_PTR:.*]] = getelementptr ptr, ptr %[[COOKIE_PTR]], i64 8 +// LLVM: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +// OGCG: @_Z26t_constant_size_nontrivialv() +// OGCG: %[[ALLOCA:.*]] = alloca ptr, align 8 +// OGCG: %[[COOKIE_PTR:.*]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 11) +// OGCG: store i64 3, ptr %[[COOKIE_PTR]], align 8 +// OGCG: %[[ALLOCATED_PTR:.*]] = getelementptr inbounds i8, ptr %[[COOKIE_PTR]], i64 8 +// OGCG: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +class D { + public: + int x; + ~D(); +}; + +void t_constant_size_nontrivial2() { + auto p = new D[3]; +} + +// In this test SIZE_WITHOUT_COOKIE isn't used, but it would be if there were +// an initializer. 
+ +// CHECK: cir.func{{.*}} @_Z27t_constant_size_nontrivial2v() +// CHECK: %[[P_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} +// CHECK: %[[#NUM_ELEMENTS:]] = cir.const #cir.int<3> : !u64i +// CHECK: %[[#SIZE_WITHOUT_COOKIE:]] = cir.const #cir.int<12> : !u64i +// CHECK: %[[#ALLOCATION_SIZE:]] = cir.const #cir.int<20> : !u64i +// CHECK: %[[RAW_PTR:.*]] = cir.call @_Znam(%[[#ALLOCATION_SIZE]]) : (!u64i) -> !cir.ptr +// CHECK: %[[COOKIE_PTR_BASE:.*]] = cir.cast bitcast %[[RAW_PTR]] : !cir.ptr -> !cir.ptr> +// CHECK: %[[COOKIE_PTR:.*]] = cir.cast bitcast %[[COOKIE_PTR_BASE]] : !cir.ptr> -> !cir.ptr +// CHECK: cir.store align(8) %[[#NUM_ELEMENTS]], %[[COOKIE_PTR]] : !u64i, !cir.ptr +// CHECK: %[[#COOKIE_SIZE:]] = cir.const #cir.int<8> : !s32i +// CHECK: %[[DATA_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[#COOKIE_SIZE]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[DATA_PTR_VOID:.*]] = cir.cast bitcast %[[DATA_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: %[[DATA_PTR:.*]] = cir.cast bitcast %[[DATA_PTR_VOID]] : !cir.ptr -> !cir.ptr +// CHECK: cir.store align(8) %[[DATA_PTR]], %[[P_ADDR]] : !cir.ptr, !cir.ptr> +// CHECK: cir.return +// CHECK: } + +// LLVM: @_Z27t_constant_size_nontrivial2v() +// LLVM: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[COOKIE_PTR:.*]] = call ptr @_Znam(i64 20) +// LLVM: store i64 3, ptr %[[COOKIE_PTR]], align 8 +// LLVM: %[[ALLOCATED_PTR:.*]] = getelementptr ptr, ptr %[[COOKIE_PTR]], i64 8 +// LLVM: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +struct alignas(16) E { + int x; + ~E(); +}; + +void t_align16_nontrivial() { + auto p = new E[2]; +} + +// CHECK: cir.func{{.*}} @_Z20t_align16_nontrivialv() +// CHECK: %[[P_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} +// CHECK: %[[#NUM_ELEMENTS:]] = cir.const #cir.int<2> : !u64i +// CHECK: %[[#SIZE_WITHOUT_COOKIE:]] = cir.const #cir.int<32> : !u64i +// CHECK: %[[#ALLOCATION_SIZE:]] = cir.const #cir.int<48> : !u64i +// CHECK: %[[RAW_PTR:.*]] = cir.call @_Znam(%[[#ALLOCATION_SIZE]]) : (!u64i) -> !cir.ptr +// CHECK: %[[COOKIE_PTR_BASE:.*]] = cir.cast bitcast %[[RAW_PTR]] : !cir.ptr -> !cir.ptr> +// CHECK: %[[COOKIE_OFFSET:.*]] = cir.const #cir.int<8> : !s32i +// CHECK: %[[COOKIE_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[COOKIE_OFFSET]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[COOKIE_PTR:.*]] = cir.cast bitcast %[[COOKIE_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: cir.store align(8) %[[#NUM_ELEMENTS]], %[[COOKIE_PTR]] : !u64i, !cir.ptr +// CHECK: %[[#COOKIE_SIZE:]] = cir.const #cir.int<16> : !s32i +// CHECK: %[[DATA_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[#COOKIE_SIZE]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[DATA_PTR_VOID:.*]] = cir.cast bitcast %[[DATA_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: %[[DATA_PTR:.*]] = cir.cast bitcast %[[DATA_PTR_VOID]] : !cir.ptr -> !cir.ptr +// CHECK: cir.store align(8) %[[DATA_PTR]], %[[P_ADDR]] : !cir.ptr, !cir.ptr> +// CHECK: cir.return +// CHECK: } + +// LLVM: @_Z20t_align16_nontrivialv() +// LLVM: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[RAW_PTR:.*]] = call ptr @_Znam(i64 48) +// LLVM: %[[COOKIE_PTR:.*]] = getelementptr ptr, ptr %[[RAW_PTR]], i64 8 +// LLVM: store i64 2, ptr %[[COOKIE_PTR]], align 8 +// LLVM: %[[ALLOCATED_PTR:.*]] = getelementptr ptr, ptr %[[RAW_PTR]], i64 16 +// LLVM: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +// OGCG: define{{.*}} void @_Z20t_align16_nontrivialv +// OGCG: %[[ALLOCA:.*]] = alloca ptr, 
align 8 +// OGCG: %[[RAW_PTR:.*]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 48) +// OGCG: %[[COOKIE_PTR:.*]] = getelementptr inbounds i8, ptr %[[RAW_PTR]], i64 8 +// OGCG: store i64 2, ptr %[[COOKIE_PTR]], align 8 +// OGCG: %[[ALLOCATED_PTR:.*]] = getelementptr inbounds i8, ptr %[[RAW_PTR]], i64 16 +// OGCG: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 +// OGCG: ret void void t_new_multidim_constant_size() { auto p = new double[2][3][4]; From 41cc0de595ebad1af7d88e1f0a0d439a399e85bd Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Tue, 21 Oct 2025 22:33:54 +0200 Subject: [PATCH 006/530] [CIR] Const member expr for struct type (#164172) Upstream support the const member expr for struct type --- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 3 +- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 5 +- clang/test/CIR/CodeGen/struct.cpp | 64 +++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 568cbdb06bb48..d6d226b3f75db 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -280,6 +280,7 @@ class AggExprEmitter : public StmtVisitor { void VisitUnaryDeref(UnaryOperator *e) { emitAggLoadOfLValue(e); } void VisitStringLiteral(StringLiteral *e) { emitAggLoadOfLValue(e); } void VisitCompoundLiteralExpr(CompoundLiteralExpr *e); + void VisitPredefinedExpr(const PredefinedExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitPredefinedExpr"); @@ -670,7 +671,7 @@ void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc, return; } - cgf.cgm.errorNYI("emitStoreThroughBitfieldLValue"); + cgf.emitStoreThroughBitfieldLValue(RValue::get(null), lv); return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 138082b59fecd..33eb748d72219 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -2041,8 +2041,9 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) { assert(!cir::MissingFeatures::tryEmitAsConstant()); Expr::EvalResult result; if (e->EvaluateAsInt(result, cgf.getContext(), Expr::SE_AllowSideEffects)) { - cgf.cgm.errorNYI(e->getSourceRange(), "Constant interger member expr"); - // Fall through to emit this as a non-constant access. 
+ llvm::APSInt value = result.Val.getInt(); + cgf.emitIgnoredExpr(e->getBase()); + return builder.getConstInt(cgf.getLoc(e->getExprLoc()), value); } return emitLoadOfLValue(e); } diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index 6d362c79c1c44..c8db71498e477 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -280,3 +280,67 @@ void bin_comma() { // OGCG: define{{.*}} void @_Z9bin_commav() // OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4 // OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false) + +void compound_literal_expr() { CompleteS a = (CompleteS){}; } + +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init] +// CIR: %[[A_ELEM_0_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_0_PTR]] : !s32i, !cir.ptr +// CIR: %[[A_ELEM_1_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s8i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_1_PTR]] : !s8i, !cir.ptr + +// TODO(cir): zero-initialize the padding + +// LLVM: %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: %[[A_ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 0 +// LLVM: store i32 0, ptr %[[A_ELEM_0_PTR]], align 4 +// LLVM: %[[A_ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 1 +// LLVM: store i8 0, ptr %[[A_ELEM_1_PTR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false) + +struct StructWithConstMember { + int a : 1; +}; + +void struct_with_const_member_expr() { + int a = (StructWithConstMember){}.a; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[RESULT:.*]] = cir.scope { +// CIR: %[[REF_ADDR:.*]] = cir.alloca !rec_StructWithConstMember, !cir.ptr, ["ref.tmp0"] +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[REF_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[SET_BF:.*]] = cir.set_bitfield{{.*}} (#bfi_a, %[[ELEM_0_PTR]] : !cir.ptr, %[[CONST_0]] : !s32i) -> !s32i +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.yield %[[CONST_0]] : !s32i +// CIR: } : !s32i +// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !s32i, !cir.ptr + +// TODO(cir): zero-initialize the padding + +// LLVM: %[[REF_ADDR:.*]] = alloca %struct.StructWithConstMember, i64 1, align 4 +// LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: br label %[[BF_LABEL:.*]] +// LLVM: [[BF_LABEL]]: +// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.StructWithConstMember, ptr %[[REF_ADDR]], i32 0, i32 0 +// LLVM: %[[TMP_REF:.*]] = load i8, ptr %[[ELEM_0_PTR]], align 4 +// LLVM: %[[BF_CLEAR:.*]] = and i8 %[[TMP_REF]], -2 +// LLVM: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 +// LLVM: store i8 %[[BF_SET]], ptr %[[ELEM_0_PTR]], align 4 +// LLVM: br label %[[RESULT_LABEL:.*]] +// LLVM: [[RESULT_LABEL]]: +// LLVM: %[[RESULT:.*]] = phi i32 [ 0, %[[BF_LABEL]] ] +// LLVM: store i32 %[[RESULT]], ptr %[[A_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[REF_ADDR:.*]] = alloca %struct.StructWithConstMember, align 4 +// OGCG: %[[TMP_REF:.*]] = load i8, ptr %[[REF_ADDR]], align 4 +// OGCG: %[[BF_CLEAR:.*]] = and i8 
%[[TMP_REF]], -2 +// OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 +// OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4 +// OGCG: store i32 0, ptr %[[A_ADDR]], align 4 From dab1148cfa2c1e51bf1b9af5587d0afecb084279 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 21 Oct 2025 13:37:49 -0700 Subject: [PATCH 007/530] [flang] Address OpenACC name resolution gaps (#164313) Some OpenACC parsers aren't filling in the "source" data members of parse tree nodes, or not doing so correctly; and some of those nodes are not adding their source data members to the source ranges of the current scope when being visited in name resolution, which causes SemanticsContext::FindScope() to misidentify the current scope in directive resolution when creating contexts. Further, the name resolution for a "use_device" clause isn't walking its subtrees, so some parser::Name nodes are not being resolved to Symbols. Fix these problems, and clean up resolve-directives.cpp a bit, since most Name nodes don't need to have their symbol table pointers updated now. --- flang/include/flang/Semantics/semantics.h | 1 + flang/lib/Parser/openacc-parsers.cpp | 71 ++++++++-------- flang/lib/Semantics/resolve-directives.cpp | 82 +++++++++---------- flang/lib/Semantics/resolve-names.cpp | 38 ++++++--- flang/lib/Semantics/semantics.cpp | 9 ++ .../OpenACC/openacc-type-categories-class.f90 | 2 +- flang/test/Semantics/OpenACC/bug1583.f90 | 23 ++++++ 7 files changed, 134 insertions(+), 92 deletions(-) create mode 100644 flang/test/Semantics/OpenACC/bug1583.f90 diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index f7910ad38a19d..c03d0a02d7faf 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -262,6 +262,7 @@ class SemanticsContext { const Scope &FindScope(parser::CharBlock) const; Scope &FindScope(parser::CharBlock); void UpdateScopeIndex(Scope &, parser::CharBlock); + void DumpScopeIndex(llvm::raw_ostream &) const; bool IsInModuleFile(parser::CharBlock) const; diff --git a/flang/lib/Parser/openacc-parsers.cpp b/flang/lib/Parser/openacc-parsers.cpp index ad035e6ade321..0dec56521f750 100644 --- a/flang/lib/Parser/openacc-parsers.cpp +++ b/flang/lib/Parser/openacc-parsers.cpp @@ -75,21 +75,21 @@ TYPE_PARSER( // tile size is one of: // * (represented as an empty std::optional) // constant-int-expr -TYPE_PARSER(construct(scalarIntConstantExpr) || +TYPE_PARSER(sourced(construct(scalarIntConstantExpr) || construct( - "*" >> construct>())) + "*" >> construct>()))) TYPE_PARSER(construct(nonemptyList(Parser{}))) // 2.9 (1979-1982) gang-arg is one of : // [num:]int-expr // dim:int-expr // static:size-expr -TYPE_PARSER(construct(construct( - "STATIC: " >> Parser{})) || +TYPE_PARSER(sourced(construct(construct( + "STATIC: " >> Parser{})) || construct( construct("DIM: " >> scalarIntExpr)) || construct( - construct(maybe("NUM: "_tok) >> scalarIntExpr))) + construct(maybe("NUM: "_tok) >> scalarIntExpr)))) // 2.9 gang-arg-list TYPE_PARSER( @@ -101,7 +101,7 @@ TYPE_PARSER(construct( // 2.5.15 Reduction, F'2023 R1131, and CUF reduction-op // Operator for reduction -TYPE_PARSER(sourced(construct( +TYPE_PARSER(construct( first("+" >> pure(ReductionOperator::Operator::Plus), "*" >> pure(ReductionOperator::Operator::Multiply), "MAX" >> pure(ReductionOperator::Operator::Max), @@ -112,32 +112,32 @@ TYPE_PARSER(sourced(construct( ".AND." >> pure(ReductionOperator::Operator::And), ".OR." >> pure(ReductionOperator::Operator::Or), ".EQV." 
>> pure(ReductionOperator::Operator::Eqv), - ".NEQV." >> pure(ReductionOperator::Operator::Neqv))))) + ".NEQV." >> pure(ReductionOperator::Operator::Neqv)))) // 2.15.1 Bind clause -TYPE_PARSER(sourced(construct(name)) || - sourced(construct(scalarDefaultCharExpr))) +TYPE_PARSER(sourced(construct(name) || + construct(scalarDefaultCharExpr))) // 2.5.16 Default clause -TYPE_PARSER(construct( +TYPE_PARSER(sourced(construct( first("NONE" >> pure(llvm::acc::DefaultValue::ACC_Default_none), - "PRESENT" >> pure(llvm::acc::DefaultValue::ACC_Default_present)))) + "PRESENT" >> pure(llvm::acc::DefaultValue::ACC_Default_present))))) // SELF clause is either a simple optional condition for compute construct // or a synonym of the HOST clause for the update directive 2.14.4 holding // an object list. -TYPE_PARSER( +TYPE_PARSER(sourced( construct(Parser{}) / lookAhead(")"_tok) || - construct(scalarLogicalExpr / lookAhead(")"_tok)) || + construct(scalarLogicalExpr) / lookAhead(")"_tok) || construct( recovery(fail>( "logical expression or object list expected"_err_en_US), - SkipTo<')'>{} >> pure>()))) + SkipTo<')'>{} >> pure>())))) // Modifier for copyin, copyout, cache and create -TYPE_PARSER(construct( +TYPE_PARSER(sourced(construct( first("ZERO:" >> pure(AccDataModifier::Modifier::Zero), - "READONLY:" >> pure(AccDataModifier::Modifier::ReadOnly)))) + "READONLY:" >> pure(AccDataModifier::Modifier::ReadOnly))))) // Combined directives TYPE_PARSER(sourced(construct( @@ -166,14 +166,13 @@ TYPE_PARSER(sourced(construct( TYPE_PARSER(sourced(construct( first("LOOP" >> pure(llvm::acc::Directive::ACCD_loop))))) -TYPE_PARSER(construct( - sourced(Parser{}), Parser{})) +TYPE_PARSER(sourced(construct( + Parser{}, Parser{}))) TYPE_PARSER(construct("END LOOP"_tok)) TYPE_PARSER(construct( - sourced(Parser{} / endAccLine), - maybe(Parser{}), + Parser{} / endAccLine, maybe(Parser{}), maybe(startAccLine >> Parser{} / endAccLine))) // 2.15.1 Routine directive @@ -186,8 +185,8 @@ TYPE_PARSER(sourced( parenthesized(Parser{})))) // 2.11 Combined constructs -TYPE_PARSER(construct( - sourced(Parser{}), Parser{})) +TYPE_PARSER(sourced(construct( + Parser{}, Parser{}))) // 2.12 Atomic constructs TYPE_PARSER(construct(startAccLine >> "END ATOMIC"_tok)) @@ -213,10 +212,10 @@ TYPE_PARSER("ATOMIC" >> statement(assignmentStmt), Parser{} / endAccLine)) TYPE_PARSER( - sourced(construct(Parser{})) || - sourced(construct(Parser{})) || - sourced(construct(Parser{})) || - sourced(construct(Parser{}))) + sourced(construct(Parser{}) || + construct(Parser{}) || + construct(Parser{}) || + construct(Parser{}))) // 2.13 Declare constructs TYPE_PARSER(sourced(construct( @@ -250,18 +249,18 @@ TYPE_PARSER(construct( pure(llvm::acc::Directive::ACCD_data)))))) // Standalone constructs -TYPE_PARSER(construct( - sourced(Parser{}), Parser{})) +TYPE_PARSER(sourced(construct( + Parser{}, Parser{}))) // Standalone declarative constructs -TYPE_PARSER(construct( - sourced(Parser{}), Parser{})) +TYPE_PARSER(sourced(construct( + Parser{}, Parser{}))) TYPE_PARSER(startAccLine >> withMessage("expected OpenACC directive"_err_en_US, - first(sourced(construct( - Parser{})), - sourced(construct( + sourced(first(construct( + Parser{}), + construct( Parser{}))))) TYPE_PARSER(sourced(construct( @@ -293,9 +292,9 @@ TYPE_PARSER(startAccLine >> "SERIAL"_tok >> maybe("LOOP"_tok) >> pure(llvm::acc::Directive::ACCD_serial_loop)))))) -TYPE_PARSER(construct( - sourced(Parser{} / endAccLine), +TYPE_PARSER(sourced(construct( + Parser{} / endAccLine, maybe(Parser{}), - 
maybe(Parser{} / endAccLine))) + maybe(Parser{} / endAccLine)))) } // namespace Fortran::parser diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 33e9ea5a89efd..b0c36eced54c8 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -31,15 +31,17 @@ #include #include +namespace Fortran::semantics { + template -static Fortran::semantics::Scope *GetScope( - Fortran::semantics::SemanticsContext &context, const T &x) { - std::optional source{GetLastSource(x)}; - return source ? &context.FindScope(*source) : nullptr; +static Scope *GetScope(SemanticsContext &context, const T &x) { + if (auto source{GetLastSource(x)}) { + return &context.FindScope(*source); + } else { + return nullptr; + } } -namespace Fortran::semantics { - template class DirectiveAttributeVisitor { public: explicit DirectiveAttributeVisitor(SemanticsContext &context) @@ -361,7 +363,7 @@ class AccAttributeVisitor : DirectiveAttributeVisitor { void ResolveAccObject(const parser::AccObject &, Symbol::Flag); Symbol *ResolveAcc(const parser::Name &, Symbol::Flag, Scope &); Symbol *ResolveAcc(Symbol &, Symbol::Flag, Scope &); - Symbol *ResolveName(const parser::Name &, bool parentScope = false); + Symbol *ResolveName(const parser::Name &); Symbol *ResolveFctName(const parser::Name &); Symbol *ResolveAccCommonBlockName(const parser::Name *); Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); @@ -1257,31 +1259,22 @@ bool AccAttributeVisitor::Pre(const parser::OpenACCStandaloneConstruct &x) { return true; } -Symbol *AccAttributeVisitor::ResolveName( - const parser::Name &name, bool parentScope) { - Symbol *prev{currScope().FindSymbol(name.source)}; - // Check in parent scope if asked for. 
- if (!prev && parentScope) { - prev = currScope().parent().FindSymbol(name.source); - } - if (prev != name.symbol) { - name.symbol = prev; - } - return prev; +Symbol *AccAttributeVisitor::ResolveName(const parser::Name &name) { + return name.symbol; } Symbol *AccAttributeVisitor::ResolveFctName(const parser::Name &name) { Symbol *prev{currScope().FindSymbol(name.source)}; - if (!prev || (prev && prev->IsFuncResult())) { + if (prev && prev->IsFuncResult()) { prev = currScope().parent().FindSymbol(name.source); - if (!prev) { - prev = &context_.globalScope().MakeSymbol( - name.source, Attrs{}, ProcEntityDetails{}); - } } - if (prev != name.symbol) { - name.symbol = prev; + if (!prev) { + prev = &*context_.globalScope() + .try_emplace(name.source, ProcEntityDetails{}) + .first->second; } + CHECK(!name.symbol || name.symbol == prev); + name.symbol = prev; return prev; } @@ -1388,9 +1381,8 @@ bool AccAttributeVisitor::Pre(const parser::OpenACCRoutineConstruct &x) { } else { PushContext(verbatim.source, llvm::acc::Directive::ACCD_routine); } - const auto &optName{std::get>(x.t)}; - if (optName) { - if (Symbol *sym = ResolveFctName(*optName)) { + if (const auto &optName{std::get>(x.t)}) { + if (Symbol * sym{ResolveFctName(*optName)}) { Symbol &ultimate{sym->GetUltimate()}; AddRoutineInfoToSymbol(ultimate, x); } else { @@ -1425,7 +1417,7 @@ bool AccAttributeVisitor::Pre(const parser::OpenACCCombinedConstruct &x) { case llvm::acc::Directive::ACCD_kernels_loop: case llvm::acc::Directive::ACCD_parallel_loop: case llvm::acc::Directive::ACCD_serial_loop: - PushContext(combinedDir.source, combinedDir.v); + PushContext(x.source, combinedDir.v); break; default: break; @@ -1706,26 +1698,27 @@ void AccAttributeVisitor::Post(const parser::AccDefaultClause &x) { } } -// For OpenACC constructs, check all the data-refs within the constructs -// and adjust the symbol for each Name if necessary void AccAttributeVisitor::Post(const parser::Name &name) { - auto *symbol{name.symbol}; - if (symbol && WithinConstruct()) { - symbol = &symbol->GetUltimate(); - if (!symbol->owner().IsDerivedType() && !symbol->has() && - !symbol->has() && !IsObjectWithVisibleDSA(*symbol)) { + if (name.symbol && WithinConstruct()) { + const Symbol &symbol{name.symbol->GetUltimate()}; + if (!symbol.owner().IsDerivedType() && !symbol.has() && + !symbol.has() && !IsObjectWithVisibleDSA(symbol)) { if (Symbol * found{currScope().FindSymbol(name.source)}) { - if (symbol != found) { - name.symbol = found; // adjust the symbol within region + if (&symbol != found) { + // adjust the symbol within the region + // TODO: why didn't name resolution set the right name originally? + name.symbol = found; } else if (GetContext().defaultDSA == Symbol::Flag::AccNone) { // 2.5.14. context_.Say(name.source, "The DEFAULT(NONE) clause requires that '%s' must be listed in a data-mapping clause"_err_en_US, - symbol->name()); + symbol.name()); } + } else { + // TODO: assertion here? or clear name.symbol? 
} } - } // within OpenACC construct + } } Symbol *AccAttributeVisitor::ResolveAccCommonBlockName( @@ -1810,13 +1803,11 @@ Symbol *AccAttributeVisitor::ResolveAcc( Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( const parser::Name &name, Symbol::Flag accFlag) { - Symbol *prev{currScope().FindSymbol(name.source)}; - if (!name.symbol || !prev) { + if (name.symbol) { + return DeclareOrMarkOtherAccessEntity(*name.symbol, accFlag); + } else { return nullptr; - } else if (prev != name.symbol) { - name.symbol = prev; } - return DeclareOrMarkOtherAccessEntity(*prev, accFlag); } Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( @@ -2990,6 +2981,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) { } Symbol *OmpAttributeVisitor::ResolveName(const parser::Name *name) { + // TODO: why is the symbol not properly resolved by name resolution? if (auto *resolvedSymbol{ name ? GetContext().scope.FindSymbol(name->source) : nullptr}) { name->symbol = resolvedSymbol; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 0af1c94502bb4..6319937dc3ddb 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1441,6 +1441,30 @@ class AccVisitor : public virtual DeclarationVisitor { void Post(const parser::AccBeginLoopDirective &x) { messageHandler().set_currStmtSource(std::nullopt); } + bool Pre(const parser::OpenACCStandaloneConstruct &x) { + currScope().AddSourceRange(x.source); + return true; + } + bool Pre(const parser::OpenACCCacheConstruct &x) { + currScope().AddSourceRange(x.source); + return true; + } + bool Pre(const parser::OpenACCWaitConstruct &x) { + currScope().AddSourceRange(x.source); + return true; + } + bool Pre(const parser::OpenACCAtomicConstruct &x) { + currScope().AddSourceRange(x.source); + return true; + } + bool Pre(const parser::OpenACCEndConstruct &x) { + currScope().AddSourceRange(x.source); + return true; + } + bool Pre(const parser::OpenACCDeclarativeConstruct &x) { + currScope().AddSourceRange(x.source); + return true; + } void CopySymbolWithDevice(const parser::Name *name); @@ -1480,7 +1504,8 @@ void AccVisitor::CopySymbolWithDevice(const parser::Name *name) { // symbols are created for the one appearing in the use_device // clause. These new symbols have the CUDA Fortran device // attribute. 
- if (context_.languageFeatures().IsEnabled(common::LanguageFeature::CUDA)) { + if (context_.languageFeatures().IsEnabled(common::LanguageFeature::CUDA) && + name->symbol) { name->symbol = currScope().CopySymbol(*name->symbol); if (auto *object{name->symbol->detailsIf()}) { object->set_cudaDataAttr(common::CUDADataAttr::Device); @@ -1490,15 +1515,12 @@ void AccVisitor::CopySymbolWithDevice(const parser::Name *name) { bool AccVisitor::Pre(const parser::AccClause::UseDevice &x) { for (const auto &accObject : x.v.v) { + Walk(accObject); common::visit( common::visitors{ [&](const parser::Designator &designator) { if (const auto *name{ parser::GetDesignatorNameIfDataRef(designator)}) { - Symbol *prev{currScope().FindSymbol(name->source)}; - if (prev != name->symbol) { - name->symbol = prev; - } CopySymbolWithDevice(name); } else { if (const auto *dataRef{ @@ -1507,13 +1529,8 @@ bool AccVisitor::Pre(const parser::AccClause::UseDevice &x) { common::Indirection; if (auto *ind{std::get_if(&dataRef->u)}) { const parser::ArrayElement &arrayElement{ind->value()}; - Walk(arrayElement.subscripts); const parser::DataRef &base{arrayElement.base}; if (auto *name{std::get_if(&base.u)}) { - Symbol *prev{currScope().FindSymbol(name->source)}; - if (prev != name->symbol) { - name->symbol = prev; - } CopySymbolWithDevice(name); } } @@ -1537,6 +1554,7 @@ void AccVisitor::Post(const parser::OpenACCBlockConstruct &x) { bool AccVisitor::Pre(const parser::OpenACCCombinedConstruct &x) { PushScope(Scope::Kind::OpenACCConstruct, nullptr); + currScope().AddSourceRange(x.source); return true; } diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index bdb5377265c14..2606d997b1cd7 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -452,6 +452,15 @@ void SemanticsContext::UpdateScopeIndex( } } +void SemanticsContext::DumpScopeIndex(llvm::raw_ostream &out) const { + out << "scopeIndex_:\n"; + for (const auto &[source, scope] : scopeIndex_) { + out << "source '" << source.ToString() << "' -> scope " << scope + << "... whose source range is '" << scope.sourceRange().ToString() + << "'\n"; + } +} + bool SemanticsContext::IsInModuleFile(parser::CharBlock source) const { for (const Scope *scope{&FindScope(source)}; !scope->IsGlobal(); scope = &scope->parent()) { diff --git a/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 b/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 index e8951cceeeaeb..ec97114d30f88 100644 --- a/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 +++ b/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 @@ -43,4 +43,4 @@ subroutine init_unlimited(this) ! TODO: After using select type - the appropriate type category should be ! possible. Add the rest of the test once OpenACC lowering correctly handles -! unlimited polymorhic. +! unlimited polymorphic. diff --git a/flang/test/Semantics/OpenACC/bug1583.f90 b/flang/test/Semantics/OpenACC/bug1583.f90 new file mode 100644 index 0000000000000..7778d4644a9ff --- /dev/null +++ b/flang/test/Semantics/OpenACC/bug1583.f90 @@ -0,0 +1,23 @@ +! 
RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenacc +!DEF: /m Module +module m + !DEF: /m/t PUBLIC DerivedType + type :: t + !DEF: /m/t/c ALLOCATABLE ObjectEntity REAL(4) + real, allocatable :: c(:) + end type +contains + !DEF: /m/sub PUBLIC (Subroutine) Subprogram + !DEF: /m/sub/v ObjectEntity TYPE(t) + subroutine sub (v) + !REF: /m/t + !REF: /m/sub/v + type(t) :: v +!$acc host_data use_device(v%c) + !DEF: /foo EXTERNAL (Subroutine) ProcEntity + !REF: /m/sub/v + !REF: /m/t/c + call foo(v%c) +!$acc end host_data + end subroutine +end module From ef9494b27c64bd63b5a23c74baa7b5e444957013 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 21 Oct 2025 13:38:11 -0700 Subject: [PATCH 008/530] [flang] Non-generic interface must shadow outer generic (#164469) A procedure defined in non-generic INTERFACE block must completely shadow any generic interface of the same name in an outer scope. Fixes https://github.com/llvm/llvm-project/issues/164303. --- flang/lib/Semantics/resolve-names.cpp | 6 +++++- flang/test/Semantics/bug164303.f90 | 31 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 flang/test/Semantics/bug164303.f90 diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 6319937dc3ddb..88cc4469034e7 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5451,7 +5451,8 @@ void SubprogramVisitor::PushBlockDataScope(const parser::Name &name) { } } -// If name is a generic, return specific subprogram with the same name. +// If name is a generic in the same scope, return its specific subprogram with +// the same name, if any. Symbol *SubprogramVisitor::GetSpecificFromGeneric(const parser::Name &name) { // Search for the name but don't resolve it if (auto *symbol{currScope().FindSymbol(name.source)}) { @@ -5461,6 +5462,9 @@ Symbol *SubprogramVisitor::GetSpecificFromGeneric(const parser::Name &name) { // symbol doesn't inherit it and ruin the ability to check it. 
symbol->attrs().reset(Attr::MODULE); } + } else if (&symbol->owner() != &currScope() && inInterfaceBlock() && + !isGeneric()) { + // non-generic interface shadows outer definition } else if (auto *details{symbol->detailsIf()}) { // found generic, want specific procedure auto *specific{details->specific()}; diff --git a/flang/test/Semantics/bug164303.f90 b/flang/test/Semantics/bug164303.f90 new file mode 100644 index 0000000000000..c356c07392577 --- /dev/null +++ b/flang/test/Semantics/bug164303.f90 @@ -0,0 +1,31 @@ +!RUN: %flang -fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s +module foo_mod + use, intrinsic :: iso_fortran_env + use, intrinsic :: iso_c_binding + implicit none + + interface new_foo + procedure :: foo_ctor + end interface + +contains + +function foo_ctor(options) result(retval) + implicit none + integer, intent(in) :: options + integer :: retval + + interface +!CHECK-NOT: error: + subroutine new_foo(f, opt) bind(c, name='new_foo') + import + implicit none + integer, intent(inout) :: f + integer(c_int), intent(in) :: opt + end subroutine + end interface + + call new_foo(retval, options) +end function + +end module From d5103c1254a89d6222777303ca2057a10934e45b Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 21 Oct 2025 13:38:29 -0700 Subject: [PATCH 009/530] [flang] Line continuation for !@acc and !@cuf conditional lines (#164475) Some Fortran source-level features that work for OpenMP !$ conditional lines, such as free form line continuation, don't work for OpenACC !@acc or CUDA !@cuf conditional lines. Make them less particular. Fixes https://github.com/llvm/llvm-project/issues/164470. --- flang/lib/Parser/prescan.cpp | 26 +++++++++----------------- flang/lib/Parser/prescan.h | 12 +++++++++++- flang/test/Preprocessing/bug164470.cuf | 6 ++++++ 3 files changed, 26 insertions(+), 18 deletions(-) create mode 100644 flang/test/Preprocessing/bug164470.cuf diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 66e5b2cbd5c7f..df0372bbe554a 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -140,17 +140,9 @@ void Prescanner::Statement() { CHECK(*at_ == '!'); } std::optional condOffset; - if (InOpenMPConditionalLine()) { + if (InOpenMPConditionalLine()) { // !$ condOffset = 2; - } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' && - directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' && - directiveSentinel_[4] == '\0') { - // CUDA conditional compilation line. - condOffset = 5; - } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'a' && - directiveSentinel_[2] == 'c' && directiveSentinel_[3] == 'c' && - directiveSentinel_[4] == '\0') { - // OpenACC conditional compilation line. + } else if (InOpenACCOrCUDAConditionalLine()) { // !@acc or !@cuf condOffset = 5; } if (condOffset && !preprocessingOnly_) { @@ -166,7 +158,8 @@ void Prescanner::Statement() { } else { // Compiler directive. Emit normalized sentinel, squash following spaces. // Conditional compilation lines (!$) take this path in -E mode too - // so that -fopenmp only has to appear on the later compilation. + // so that -fopenmp only has to appear on the later compilation + // (ditto for !@cuf and !@acc). 
EmitChar(tokens, '!'); ++at_, ++column_; for (const char *sp{directiveSentinel_}; *sp != '\0'; @@ -202,7 +195,7 @@ void Prescanner::Statement() { } tokens.CloseToken(); SkipSpaces(); - if (InOpenMPConditionalLine() && inFixedForm_ && !tabInCurrentLine_ && + if (InConditionalLine() && inFixedForm_ && !tabInCurrentLine_ && column_ == 6 && *at_ != '\n') { // !$ 0 - turn '0' into a space // !$ 1 - turn '1' into '&' @@ -347,7 +340,7 @@ void Prescanner::Statement() { while (CompilerDirectiveContinuation(tokens, line.sentinel)) { newlineProvenance = GetCurrentProvenance(); } - if (preprocessingOnly_ && inFixedForm_ && InOpenMPConditionalLine() && + if (preprocessingOnly_ && inFixedForm_ && InConditionalLine() && nextLine_ < limit_) { // In -E mode, when the line after !$ conditional compilation is a // regular fixed form continuation line, append a '&' to the line. @@ -1360,11 +1353,10 @@ const char *Prescanner::FixedFormContinuationLine(bool atNewline) { features_.IsEnabled(LanguageFeature::OldDebugLines))) && nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' && nextLine_[4] == ' '}; - if (InCompilerDirective() && - !(InOpenMPConditionalLine() && !preprocessingOnly_)) { + if (InCompilerDirective() && !(InConditionalLine() && !preprocessingOnly_)) { // !$ under -E is not continued, but deferred to later compilation if (IsFixedFormCommentChar(col1) && - !(InOpenMPConditionalLine() && preprocessingOnly_)) { + !(InConditionalLine() && preprocessingOnly_)) { int j{1}; for (; j < 5; ++j) { char ch{directiveSentinel_[j - 1]}; @@ -1443,7 +1435,7 @@ const char *Prescanner::FreeFormContinuationLine(bool ampersand) { } p = SkipWhiteSpaceIncludingEmptyMacros(p); if (InCompilerDirective()) { - if (InOpenMPConditionalLine()) { + if (InConditionalLine()) { if (preprocessingOnly_) { // in -E mode, don't treat !$ as a continuation return nullptr; diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index fc38adb926530..5e7481781d944 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -171,7 +171,17 @@ class Prescanner { bool InOpenMPConditionalLine() const { return directiveSentinel_ && directiveSentinel_[0] == '$' && !directiveSentinel_[1]; - ; + } + bool InOpenACCOrCUDAConditionalLine() const { + return directiveSentinel_ && directiveSentinel_[0] == '@' && + ((directiveSentinel_[1] == 'a' && directiveSentinel_[2] == 'c' && + directiveSentinel_[3] == 'c') || + (directiveSentinel_[1] == 'c' && directiveSentinel_[2] == 'u' && + directiveSentinel_[3] == 'f')) && + directiveSentinel_[4] == '\0'; + } + bool InConditionalLine() const { + return InOpenMPConditionalLine() || InOpenACCOrCUDAConditionalLine(); } bool InFixedFormSource() const { return inFixedForm_ && !inPreprocessorDirective_ && !InCompilerDirective(); diff --git a/flang/test/Preprocessing/bug164470.cuf b/flang/test/Preprocessing/bug164470.cuf new file mode 100644 index 0000000000000..3e959f40d2e3f --- /dev/null +++ b/flang/test/Preprocessing/bug164470.cuf @@ -0,0 +1,6 @@ +!RUN: %flang_fc1 -x cuda -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: ATTRIBUTES(DEVICE) FUNCTION foo() +!@cuf attributes(device) & +function foo() + foo = 1. +end From f6799d2899ef689a5e4d55beef9b52914d2ae95f Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 21 Oct 2025 13:46:56 -0700 Subject: [PATCH 010/530] [NFC][LLVM][Twine] Change a few `const StringRef &` arguments to value (#163994) Per LLVM Programmer's Manual, StringRef should always be passed by value. Enforcing this for `Twine` functions. 
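For illustration only (not part of this patch), a minimal sketch of the calling-convention difference the Programmer's Manual describes: `StringRef` is just a pointer/length pair, so passing it by value lets the pair travel directly (typically in registers), whereas a `const StringRef &` parameter forces callers to materialize an object in memory to take its address. The helpers `lenByRef`, `lenByVal`, and `demo` below are hypothetical and exist only to contrast the two parameter styles.

```cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"

// Hypothetical helper: by-reference parameter. Callers must create a
// StringRef object in memory so its address can be passed.
static size_t lenByRef(const llvm::StringRef &S) { return S.size(); }

// Hypothetical helper: by-value parameter. The pointer/length pair is
// passed directly, which is what the LLVM Programmer's Manual recommends
// and what this patch enforces for the Twine constructors and operator+.
static size_t lenByVal(llvm::StringRef S) { return S.size(); }

void demo() {
  // Uses the Twine(const char *, StringRef) constructor touched by this patch.
  llvm::Twine T = llvm::Twine("prefix_", llvm::StringRef("name"));
  llvm::SmallString<32> Storage;
  (void)lenByRef(T.toStringRef(Storage));
  (void)lenByVal(T.toStringRef(Storage));
}
```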
--- llvm/include/llvm/ADT/Twine.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/ADT/Twine.h b/llvm/include/llvm/ADT/Twine.h index d9f9c0f0d5d9c..e3b4d5e26fa17 100644 --- a/llvm/include/llvm/ADT/Twine.h +++ b/llvm/include/llvm/ADT/Twine.h @@ -285,7 +285,7 @@ class Twine { } /// Construct from a StringRef. - /*implicit*/ Twine(const StringRef &Str) : LHSKind(PtrAndLengthKind) { + /*implicit*/ Twine(StringRef Str) : LHSKind(PtrAndLengthKind) { LHS.ptrAndLength.ptr = Str.data(); LHS.ptrAndLength.length = Str.size(); assert(isValid() && "Invalid twine!"); @@ -352,7 +352,7 @@ class Twine { // right thing. Yet. /// Construct as the concatenation of a C string and a StringRef. - /*implicit*/ Twine(const char *LHS, const StringRef &RHS) + /*implicit*/ Twine(const char *LHS, StringRef RHS) : LHSKind(CStringKind), RHSKind(PtrAndLengthKind) { this->LHS.cString = LHS; this->RHS.ptrAndLength.ptr = RHS.data(); @@ -361,7 +361,7 @@ class Twine { } /// Construct as the concatenation of a StringRef and a C string. - /*implicit*/ Twine(const StringRef &LHS, const char *RHS) + /*implicit*/ Twine(StringRef LHS, const char *RHS) : LHSKind(PtrAndLengthKind), RHSKind(CStringKind) { this->LHS.ptrAndLength.ptr = LHS.data(); this->LHS.ptrAndLength.length = LHS.size(); @@ -530,14 +530,14 @@ inline Twine operator+(const Twine &LHS, const Twine &RHS) { /// Additional overload to guarantee simplified codegen; this is equivalent to /// concat(). -inline Twine operator+(const char *LHS, const StringRef &RHS) { +inline Twine operator+(const char *LHS, StringRef RHS) { return Twine(LHS, RHS); } /// Additional overload to guarantee simplified codegen; this is equivalent to /// concat(). -inline Twine operator+(const StringRef &LHS, const char *RHS) { +inline Twine operator+(StringRef LHS, const char *RHS) { return Twine(LHS, RHS); } From 419fbf906b7d21808fdf768f07e4791a24b22c99 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 21 Oct 2025 13:50:08 -0700 Subject: [PATCH 011/530] [FlowSensitive] [StatusOr] [3/N] Support absl::Status ops (#163868) `absl::StatusOr::status` allows extraction of the status associated with a StatusOr value. That can also be used to check whether the StatusOr has a value or not. 
This makes sure code like this is checked properly: ```cpp int target(absl::StatusOr SOR) { if (SOR.status().ok()) return *SOR; return 0; } ``` --- .../Models/UncheckedStatusOrAccessModel.cpp | 74 ++++++++ ...ncheckedStatusOrAccessModelTestFixture.cpp | 161 ++++++++++++++++++ 2 files changed, 235 insertions(+) diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp index c88a4702cae07..f068be5b96d53 100644 --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp @@ -24,6 +24,7 @@ #include "clang/Analysis/FlowSensitive/DataflowAnalysis.h" #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Analysis/FlowSensitive/MatchSwitch.h" +#include "clang/Analysis/FlowSensitive/RecordOps.h" #include "clang/Analysis/FlowSensitive/StorageLocation.h" #include "clang/Analysis/FlowSensitive/Value.h" #include "clang/Basic/LLVM.h" @@ -95,6 +96,18 @@ static QualType getStatusOrValueType(ClassTemplateSpecializationDecl *TRD) { return TRD->getTemplateArgs().get(0).getAsType(); } +static auto ofClassStatus() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return ofClass(hasName("::absl::Status")); +} + +static auto isStatusMemberCallWithName(llvm::StringRef member_name) { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxMemberCallExpr( + on(expr(unless(cxxThisExpr()))), + callee(cxxMethodDecl(hasName(member_name), ofClassStatus()))); +} + static auto isStatusOrMemberCallWithName(llvm::StringRef member_name) { using namespace ::clang::ast_matchers; // NOLINT: Too many names return cxxMemberCallExpr( @@ -244,6 +257,61 @@ static void transferStatusOrOkCall(const CXXMemberCallExpr *Expr, State.Env.setValue(*Expr, OkVal); } +static void transferStatusCall(const CXXMemberCallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + RecordStorageLocation *StatusOrLoc = + getImplicitObjectLocation(*Expr, State.Env); + if (StatusOrLoc == nullptr) + return; + + RecordStorageLocation &StatusLoc = locForStatus(*StatusOrLoc); + + if (State.Env.getValue(locForOk(StatusLoc)) == nullptr) + initializeStatusOr(*StatusOrLoc, State.Env); + + if (Expr->isPRValue()) + copyRecord(StatusLoc, State.Env.getResultObjectLocation(*Expr), State.Env); + else + State.Env.setStorageLocation(*Expr, StatusLoc); +} + +static void transferStatusOkCall(const CXXMemberCallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + RecordStorageLocation *StatusLoc = + getImplicitObjectLocation(*Expr, State.Env); + if (StatusLoc == nullptr) + return; + + if (Value *Val = State.Env.getValue(locForOk(*StatusLoc))) + State.Env.setValue(*Expr, *Val); +} + +static void transferStatusUpdateCall(const CXXMemberCallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + // S.Update(OtherS) sets S to the error code of OtherS if it is OK, + // otherwise does nothing. + assert(Expr->getNumArgs() == 1); + auto *Arg = Expr->getArg(0); + RecordStorageLocation *ArgRecord = + Arg->isPRValue() ? 
&State.Env.getResultObjectLocation(*Arg) + : State.Env.get(*Arg); + RecordStorageLocation *ThisLoc = getImplicitObjectLocation(*Expr, State.Env); + if (ThisLoc == nullptr || ArgRecord == nullptr) + return; + + auto &ThisOkVal = valForOk(*ThisLoc, State.Env); + auto &ArgOkVal = valForOk(*ArgRecord, State.Env); + auto &A = State.Env.arena(); + auto &NewVal = State.Env.makeAtomicBoolValue(); + State.Env.assume(A.makeImplies(A.makeNot(ThisOkVal.formula()), + A.makeNot(NewVal.formula()))); + State.Env.assume(A.makeImplies(NewVal.formula(), ArgOkVal.formula())); + State.Env.setValue(locForOk(*ThisLoc), NewVal); +} + CFGMatchSwitch buildTransferMatchSwitch(ASTContext &Ctx, CFGMatchSwitchBuilder Builder) { @@ -251,6 +319,12 @@ buildTransferMatchSwitch(ASTContext &Ctx, return std::move(Builder) .CaseOfCFGStmt(isStatusOrMemberCallWithName("ok"), transferStatusOrOkCall) + .CaseOfCFGStmt(isStatusOrMemberCallWithName("status"), + transferStatusCall) + .CaseOfCFGStmt(isStatusMemberCallWithName("ok"), + transferStatusOkCall) + .CaseOfCFGStmt(isStatusMemberCallWithName("Update"), + transferStatusUpdateCall) .Build(); } diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp index 4827cc1d0a7e9..cae926570ffe4 100644 --- a/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp +++ b/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp @@ -2453,6 +2453,167 @@ TEST_P(UncheckedStatusOrAccessModelTest, SubclassOperator) { )cc"); } +TEST_P(UncheckedStatusOrAccessModelTest, UnwrapValueWithStatusCheck) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor) { + if (sor.status().ok()) + sor.value(); + else + sor.value(); // [[unsafe]] + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, UnwrapValueWithStatusRefCheck) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor) { + const STATUS& s = sor.status(); + if (s.ok()) + sor.value(); + else + sor.value(); // [[unsafe]] + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, UnwrapValueWithStatusPtrCheck) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor) { + const STATUS* s = &sor.status(); + if (s->ok()) + sor.value(); + else + sor.value(); // [[unsafe]] + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, UnwrapValueWithMovedStatus) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor) { + if (std::move(sor.status()).ok()) + sor.value(); + else + sor.value(); // [[unsafe]] + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, MembersUsedInsideStatus) { + ExpectDiagnosticsFor(R"cc( + namespace absl { + + class Status { + public: + bool ok() const; + + void target() const { ok(); } + }; + + } // namespace absl + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, StatusUpdate) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor) { + STATUS s; + s.Update(sor.status()); + if (s.ok()) + sor.value(); + else + sor.value(); // [[unsafe]] + } + )cc"); + + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor1, STATUSOR_INT sor2) { + STATUS s; + s.Update(sor1.status()); + s.Update(sor2.status()); + if (s.ok()) { 
+ sor1.value(); + sor2.value(); + } + } + )cc"); + + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor1, STATUSOR_INT sor2) { + STATUS s; + s.Update(sor1.status()); + CHECK(s.ok()); + s.Update(sor2.status()); + sor1.value(); + sor2.value(); // [[unsafe]] + } + )cc"); + + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor1, STATUSOR_INT sor2) { + STATUS s; + s.Update(sor1.status()); + CHECK(s.ok()); + sor1.value(); + sor2.value(); // [[unsafe]] + } + )cc"); + + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor1, STATUSOR_INT sor2) { + STATUS s; + STATUS sor1_status = sor1.status(); + s.Update(std::move(sor1_status)); + CHECK(s.ok()); + sor1.value(); + sor2.value(); // [[unsafe]] + } + )cc"); + + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target(STATUSOR_INT sor1, STATUSOR_INT sor2) { + STATUS s; + STATUS sor1_status = sor1.status(); + sor1_status.Update(sor2.status()); + s.Update(std::move(sor1_status)); + CHECK(s.ok()); + sor1.value(); + sor2.value(); + } + )cc"); + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + const STATUS& OptStatus(); + + void target(STATUSOR_INT sor) { + auto s = sor.status(); + s.Update(OptStatus()); + if (s.ok()) sor.value(); + } + )cc"); +} + } // namespace std::string From 1b2532e1df278babbf612ebbf8352fd4e82eea52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 21 Oct 2025 10:53:30 -1000 Subject: [PATCH 012/530] [flang][cuda][rt] Do not check error on kernel launch (#164463) Align behavior with other CUDA compiler. --- flang-rt/lib/cuda/kernel.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/flang-rt/lib/cuda/kernel.cpp b/flang-rt/lib/cuda/kernel.cpp index e299a114ed7eb..296f4b7913268 100644 --- a/flang-rt/lib/cuda/kernel.cpp +++ b/flang-rt/lib/cuda/kernel.cpp @@ -76,8 +76,8 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY, terminator.Crash("Too many invalid grid dimensions"); } cudaStream_t defaultStream = 0; - CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, - stream != nullptr ? (cudaStream_t)(*stream) : defaultStream)); + cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, + stream != nullptr ? (cudaStream_t)(*stream) : defaultStream); } void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, @@ -153,7 +153,7 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, launchAttr[0].val.clusterDim.z = clusterZ; config.numAttrs = 1; config.attrs = launchAttr; - CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params)); + cudaLaunchKernelExC(&config, kernel, params); } void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX, @@ -218,8 +218,8 @@ void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX, terminator.Crash("Too many invalid grid dimensions"); } cudaStream_t defaultStream = 0; - CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, - params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream)); + cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, params, smem, + stream != nullptr ? 
(cudaStream_t)*stream : defaultStream); } } // extern "C" From c5b53b1d659f249514a41dcde0f1e28ad4ffbc57 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 21 Oct 2025 13:55:53 -0700 Subject: [PATCH 013/530] [CIR] Handle return with cleanups (#163849) This adds support for branching through a cleanup block when a return statement is encountered while we're in a scope with cleanups. --- clang/include/clang/CIR/MissingFeatures.h | 5 + clang/lib/CIR/CodeGen/CIRGenCleanup.cpp | 220 +++++++++++++++++++++- clang/lib/CIR/CodeGen/CIRGenCleanup.h | 25 ++- clang/lib/CIR/CodeGen/CIRGenFunction.h | 45 +++++ clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 121 +++++++----- clang/lib/CIR/CodeGen/EHScopeStack.h | 82 +++++++- clang/test/CIR/CodeGen/dtors.cpp | 12 +- clang/test/CIR/CodeGen/lambda.cpp | 24 +-- clang/test/CIR/CodeGen/statement-exprs.c | 10 +- clang/test/CIR/CodeGen/vla.c | 59 +++++- 10 files changed, 522 insertions(+), 81 deletions(-) diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 62125fa306cb8..598e826a473a6 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -219,6 +219,9 @@ struct MissingFeatures { static bool checkBitfieldClipping() { return false; } static bool cirgenABIInfo() { return false; } static bool cleanupAfterErrorDiags() { return false; } + static bool cleanupAppendInsts() { return false; } + static bool cleanupBranchThrough() { return false; } + static bool cleanupIndexAndBIAdjustment() { return false; } static bool cleanupsToDeactivate() { return false; } static bool constEmitterAggILE() { return false; } static bool constEmitterArrayILE() { return false; } @@ -239,6 +242,7 @@ struct MissingFeatures { static bool deleteArray() { return false; } static bool devirtualizeMemberFunction() { return false; } static bool ehCleanupFlags() { return false; } + static bool ehCleanupHasPrebranchedFallthrough() { return false; } static bool ehCleanupScope() { return false; } static bool ehCleanupScopeRequiresEHCleanup() { return false; } static bool ehCleanupBranchFixups() { return false; } @@ -296,6 +300,7 @@ struct MissingFeatures { static bool setNonGC() { return false; } static bool setObjCGCLValueClass() { return false; } static bool setTargetAttributes() { return false; } + static bool simplifyCleanupEntry() { return false; } static bool sourceLanguageCases() { return false; } static bool stackBase() { return false; } static bool stackSaveOp() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index aabe4bbdf18c8..851328a7db680 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -28,6 +28,46 @@ using namespace clang::CIRGen; // CIRGenFunction cleanup related //===----------------------------------------------------------------------===// +/// Build a unconditional branch to the lexical scope cleanup block +/// or with the labeled blocked if already solved. +/// +/// Track on scope basis, goto's we need to fix later. +cir::BrOp CIRGenFunction::emitBranchThroughCleanup(mlir::Location loc, + JumpDest dest) { + // Insert a branch: to the cleanup block (unsolved) or to the already + // materialized label. Keep track of unsolved goto's. + assert(dest.getBlock() && "assumes incoming valid dest"); + auto brOp = cir::BrOp::create(builder, loc, dest.getBlock()); + + // Calculate the innermost active normal cleanup. 
+ EHScopeStack::stable_iterator topCleanup = + ehStack.getInnermostActiveNormalCleanup(); + + // If we're not in an active normal cleanup scope, or if the + // destination scope is within the innermost active normal cleanup + // scope, we don't need to worry about fixups. + if (topCleanup == ehStack.stable_end() || + topCleanup.encloses(dest.getScopeDepth())) { // works for invalid + // FIXME(cir): should we clear insertion point here? + return brOp; + } + + // If we can't resolve the destination cleanup scope, just add this + // to the current cleanup scope as a branch fixup. + if (!dest.getScopeDepth().isValid()) { + BranchFixup &fixup = ehStack.addBranchFixup(); + fixup.destination = dest.getBlock(); + fixup.destinationIndex = dest.getDestIndex(); + fixup.initialBranch = brOp; + fixup.optimisticBranchBlock = nullptr; + // FIXME(cir): should we clear insertion point here? + return brOp; + } + + cgm.errorNYI(loc, "emitBranchThroughCleanup: valid destination scope depth"); + return brOp; +} + /// Emits all the code to cause the given temporary to be cleaned up. void CIRGenFunction::emitCXXTemporary(const CXXTemporary *temporary, QualType tempType, Address ptr) { @@ -40,6 +80,19 @@ void CIRGenFunction::emitCXXTemporary(const CXXTemporary *temporary, void EHScopeStack::Cleanup::anchor() {} +EHScopeStack::stable_iterator +EHScopeStack::getInnermostActiveNormalCleanup() const { + stable_iterator si = getInnermostNormalCleanup(); + stable_iterator se = stable_end(); + while (si != se) { + EHCleanupScope &cleanup = llvm::cast(*find(si)); + if (cleanup.isActive()) + return si; + si = cleanup.getEnclosingNormalCleanup(); + } + return stable_end(); +} + /// Push an entry of the given size onto this protected-scope stack. char *EHScopeStack::allocate(size_t size) { size = llvm::alignTo(size, ScopeStackAlignment); @@ -75,14 +128,30 @@ void EHScopeStack::deallocate(size_t size) { startOfData += llvm::alignTo(size, ScopeStackAlignment); } +/// Remove any 'null' fixups on the stack. However, we can't pop more +/// fixups than the fixup depth on the innermost normal cleanup, or +/// else fixups that we try to add to that cleanup will end up in the +/// wrong place. We *could* try to shrink fixup depths, but that's +/// actually a lot of work for little benefit. +void EHScopeStack::popNullFixups() { + // We expect this to only be called when there's still an innermost + // normal cleanup; otherwise there really shouldn't be any fixups. + cgf->cgm.errorNYI("popNullFixups"); +} + void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { char *buffer = allocate(EHCleanupScope::getSizeForCleanupSize(size)); + bool isNormalCleanup = kind & NormalCleanup; bool isEHCleanup = kind & EHCleanup; bool isLifetimeMarker = kind & LifetimeMarker; assert(!cir::MissingFeatures::innermostEHScope()); - EHCleanupScope *scope = new (buffer) EHCleanupScope(size); + EHCleanupScope *scope = new (buffer) + EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup); + + if (isNormalCleanup) + innermostNormalCleanup = stable_begin(); if (isLifetimeMarker) cgf->cgm.errorNYI("push lifetime marker cleanup"); @@ -100,12 +169,23 @@ void EHScopeStack::popCleanup() { assert(isa(*begin())); EHCleanupScope &cleanup = cast(*begin()); + innermostNormalCleanup = cleanup.getEnclosingNormalCleanup(); deallocate(cleanup.getAllocatedSize()); // Destroy the cleanup. cleanup.destroy(); - assert(!cir::MissingFeatures::ehCleanupBranchFixups()); + // Check whether we can shrink the branch-fixups stack. 
+ if (!branchFixups.empty()) { + // If we no longer have any normal cleanups, all the fixups are + // complete. + if (!hasNormalCleanups()) { + branchFixups.clear(); + } else { + // Otherwise we can still trim out unnecessary nulls. + popNullFixups(); + } + } } EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { @@ -123,6 +203,18 @@ static void emitCleanup(CIRGenFunction &cgf, EHScopeStack::Cleanup *cleanup) { assert(cgf.haveInsertPoint() && "cleanup ended with no insertion point?"); } +static mlir::Block *createNormalEntry(CIRGenFunction &cgf, + EHCleanupScope &scope) { + assert(scope.isNormalCleanup()); + mlir::Block *entry = scope.getNormalBlock(); + if (!entry) { + mlir::OpBuilder::InsertionGuard guard(cgf.getBuilder()); + entry = cgf.curLexScope->getOrCreateCleanupBlock(cgf.getBuilder()); + scope.setNormalBlock(entry); + } + return entry; +} + /// Pops a cleanup block. If the block includes a normal cleanup, the /// current insertion point is threaded through the cleanup, as are /// any branch fixups on the cleanup. @@ -130,17 +222,21 @@ void CIRGenFunction::popCleanupBlock() { assert(!ehStack.empty() && "cleanup stack is empty!"); assert(isa(*ehStack.begin()) && "top not a cleanup!"); EHCleanupScope &scope = cast(*ehStack.begin()); + assert(scope.getFixupDepth() <= ehStack.getNumBranchFixups()); // Remember activation information. bool isActive = scope.isActive(); - assert(!cir::MissingFeatures::ehCleanupBranchFixups()); + // - whether there are branch fix-ups through this cleanup + unsigned fixupDepth = scope.getFixupDepth(); + bool hasFixups = ehStack.getNumBranchFixups() != fixupDepth; // - whether there's a fallthrough mlir::Block *fallthroughSource = builder.getInsertionBlock(); bool hasFallthrough = fallthroughSource != nullptr && isActive; - bool requiresNormalCleanup = scope.isNormalCleanup() && hasFallthrough; + bool requiresNormalCleanup = + scope.isNormalCleanup() && (hasFixups || hasFallthrough); // If we don't need the cleanup at all, we're done. assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); @@ -175,9 +271,119 @@ void CIRGenFunction::popCleanupBlock() { assert(!cir::MissingFeatures::ehCleanupFlags()); - ehStack.popCleanup(); - scope.markEmitted(); - emitCleanup(*this, cleanup); + // If we have a fallthrough and no other need for the cleanup, + // emit it directly. + if (hasFallthrough && !hasFixups) { + assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); + ehStack.popCleanup(); + scope.markEmitted(); + emitCleanup(*this, cleanup); + } else { + // Otherwise, the best approach is to thread everything through + // the cleanup block and then try to clean up after ourselves. + + // Force the entry block to exist. + mlir::Block *normalEntry = createNormalEntry(*this, scope); + + // I. Set up the fallthrough edge in. + mlir::OpBuilder::InsertPoint savedInactiveFallthroughIP; + + // If there's a fallthrough, we need to store the cleanup + // destination index. For fall-throughs this is always zero. + if (hasFallthrough) { + assert(!cir::MissingFeatures::ehCleanupHasPrebranchedFallthrough()); + + } else if (fallthroughSource) { + // Otherwise, save and clear the IP if we don't have fallthrough + // because the cleanup is inactive. + assert(!isActive && "source without fallthrough for active cleanup"); + savedInactiveFallthroughIP = builder.saveInsertionPoint(); + } + + // II. Emit the entry block. This implicitly branches to it if + // we have fallthrough. All the fixups and existing branches + // should already be branched to it. 
+ builder.setInsertionPointToEnd(normalEntry); + + // intercept normal cleanup to mark SEH scope end + assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); + + // III. Figure out where we're going and build the cleanup + // epilogue. + bool hasEnclosingCleanups = + (scope.getEnclosingNormalCleanup() != ehStack.stable_end()); + + // Compute the branch-through dest if we need it: + // - if there are branch-throughs threaded through the scope + // - if fall-through is a branch-through + // - if there are fixups that will be optimistically forwarded + // to the enclosing cleanup + assert(!cir::MissingFeatures::cleanupBranchThrough()); + if (hasFixups && hasEnclosingCleanups) + cgm.errorNYI("cleanup branch-through dest"); + + mlir::Block *fallthroughDest = nullptr; + + // If there's exactly one branch-after and no other threads, + // we can route it without a switch. + // Skip for SEH, since ExitSwitch is used to generate code to indicate + // abnormal termination. (SEH: Except _leave and fall-through at + // the end, all other exits in a _try (return/goto/continue/break) + // are considered as abnormal terminations, using NormalCleanupDestSlot + // to indicate abnormal termination) + assert(!cir::MissingFeatures::cleanupBranchThrough()); + assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); + + // IV. Pop the cleanup and emit it. + scope.markEmitted(); + ehStack.popCleanup(); + assert(ehStack.hasNormalCleanups() == hasEnclosingCleanups); + + emitCleanup(*this, cleanup); + + // Append the prepared cleanup prologue from above. + assert(!cir::MissingFeatures::cleanupAppendInsts()); + + // Optimistically hope that any fixups will continue falling through. + if (fixupDepth != ehStack.getNumBranchFixups()) + cgm.errorNYI("cleanup fixup depth mismatch"); + + // V. Set up the fallthrough edge out. + + // Case 1: a fallthrough source exists but doesn't branch to the + // cleanup because the cleanup is inactive. + if (!hasFallthrough && fallthroughSource) { + // Prebranched fallthrough was forwarded earlier. + // Non-prebranched fallthrough doesn't need to be forwarded. + // Either way, all we need to do is restore the IP we cleared before. + assert(!isActive); + cgm.errorNYI("cleanup inactive fallthrough"); + + // Case 2: a fallthrough source exists and should branch to the + // cleanup, but we're not supposed to branch through to the next + // cleanup. + } else if (hasFallthrough && fallthroughDest) { + cgm.errorNYI("cleanup fallthrough destination"); + + // Case 3: a fallthrough source exists and should branch to the + // cleanup and then through to the next. + } else if (hasFallthrough) { + // Everything is already set up for this. + + // Case 4: no fallthrough source exists. + } else { + // FIXME(cir): should we clear insertion point here? + } + + // VI. Assorted cleaning. + + // Check whether we can merge NormalEntry into a single predecessor. + // This might invalidate (non-IR) pointers to NormalEntry. + // + // If it did invalidate those pointers, and normalEntry was the same + // as NormalExit, go back and patch up the fixups. + assert(!cir::MissingFeatures::simplifyCleanupEntry()); + } } /// Pops cleanup blocks until the given savepoint is reached. 
diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 5852ea3f43dd5..9acf8b1f20e79 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -151,6 +151,18 @@ class EHCatchScope : public EHScope { /// A cleanup scope which generates the cleanup blocks lazily. class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope : public EHScope { + /// The nearest normal cleanup scope enclosing this one. + EHScopeStack::stable_iterator enclosingNormal; + + /// The dual entry/exit block along the normal edge. This is lazily + /// created if needed before the cleanup is popped. + mlir::Block *normalBlock = nullptr; + + /// The number of fixups required by enclosing scopes (not including + /// this one). If this is the top cleanup scope, all the fixups + /// from this index onwards belong to this scope. + unsigned fixupDepth = 0; + public: /// Gets the size required for a lazy cleanup scope with the given /// cleanup-data requirements. @@ -162,7 +174,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope return sizeof(EHCleanupScope) + cleanupBits.cleanupSize; } - EHCleanupScope(unsigned cleanupSize) : EHScope(EHScope::Cleanup) { + EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth, + EHScopeStack::stable_iterator enclosingNormal) + : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal), + fixupDepth(fixupDepth) { // TODO(cir): When exception handling is upstreamed, isNormalCleanup and // isEHCleanup will be arguments to the constructor. cleanupBits.isNormalCleanup = true; @@ -180,11 +195,19 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope // Objects of EHCleanupScope are not destructed. Use destroy(). ~EHCleanupScope() = delete; + mlir::Block *getNormalBlock() const { return normalBlock; } + void setNormalBlock(mlir::Block *bb) { normalBlock = bb; } + bool isNormalCleanup() const { return cleanupBits.isNormalCleanup; } bool isActive() const { return cleanupBits.isActive; } void setActive(bool isActive) { cleanupBits.isActive = isActive; } + unsigned getFixupDepth() const { return fixupDepth; } + EHScopeStack::stable_iterator getEnclosingNormalCleanup() const { + return enclosingNormal; + } + size_t getCleanupSize() const { return cleanupBits.cleanupSize; } void *getCleanupBuffer() { return this + 1; } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 706801f95bb6e..e3b9b6a8180d9 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -60,11 +60,44 @@ class CIRGenFunction : public CIRGenTypeCache { /// is where the next operations will be introduced. CIRGenBuilderTy &builder; + /// A jump destination is an abstract label, branching to which may + /// require a jump out through normal cleanups. + struct JumpDest { + JumpDest() = default; + JumpDest(mlir::Block *block, EHScopeStack::stable_iterator depth = {}, + unsigned index = 0) + : block(block) {} + + bool isValid() const { return block != nullptr; } + mlir::Block *getBlock() const { return block; } + EHScopeStack::stable_iterator getScopeDepth() const { return scopeDepth; } + unsigned getDestIndex() const { return index; } + + // This should be used cautiously. 
+ void setScopeDepth(EHScopeStack::stable_iterator depth) { + scopeDepth = depth; + } + + private: + mlir::Block *block = nullptr; + EHScopeStack::stable_iterator scopeDepth; + unsigned index; + }; + public: /// The GlobalDecl for the current function being compiled or the global /// variable currently being initialized. clang::GlobalDecl curGD; + /// Unified return block. + /// In CIR this is a function because each scope might have + /// its associated return block. + JumpDest returnBlock(mlir::Block *retBlock) { + return getJumpDestInCurrentScope(retBlock); + } + + unsigned nextCleanupDestIndex = 1; + /// The compiler-generated variable that holds the return value. std::optional fnRetAlloca; @@ -574,6 +607,16 @@ class CIRGenFunction : public CIRGenTypeCache { } }; + /// The given basic block lies in the current EH scope, but may be a + /// target of a potentially scope-crossing jump; get a stable handle + /// to which we can perform this jump later. + /// CIRGen: this mostly tracks state for figuring out the proper scope + /// information, no actual branches are emitted. + JumpDest getJumpDestInCurrentScope(mlir::Block *target) { + return JumpDest(target, ehStack.getInnermostNormalCleanup(), + nextCleanupDestIndex++); + } + /// Perform the usual unary conversions on the specified expression and /// compare the result against zero, returning an Int1Ty value. mlir::Value evaluateExprAsBool(const clang::Expr *e); @@ -1221,6 +1264,8 @@ class CIRGenFunction : public CIRGenTypeCache { LValue emitBinaryOperatorLValue(const BinaryOperator *e); + cir::BrOp emitBranchThroughCleanup(mlir::Location loc, JumpDest dest); + mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s); RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID, diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index ad8c4d004654f..f486c46be2035 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -446,54 +446,89 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { mlir::Location loc = getLoc(s.getSourceRange()); const Expr *rv = s.getRetValue(); - if (getContext().getLangOpts().ElideConstructors && s.getNRVOCandidate() && - s.getNRVOCandidate()->isNRVOVariable()) { - assert(!cir::MissingFeatures::openMP()); - assert(!cir::MissingFeatures::nrvo()); - } else if (!rv) { - // No return expression. Do nothing. - } else if (rv->getType()->isVoidType()) { - // Make sure not to return anything, but evaluate the expression - // for side effects. - if (rv) { - emitAnyExpr(rv); + RunCleanupsScope cleanupScope(*this); + bool createNewScope = false; + if (const auto *ewc = dyn_cast_or_null(rv)) { + rv = ewc->getSubExpr(); + createNewScope = true; + } + + auto handleReturnVal = [&]() { + if (getContext().getLangOpts().ElideConstructors && s.getNRVOCandidate() && + s.getNRVOCandidate()->isNRVOVariable()) { + assert(!cir::MissingFeatures::openMP()); + assert(!cir::MissingFeatures::nrvo()); + } else if (!rv) { + // No return expression. Do nothing. + } else if (rv->getType()->isVoidType()) { + // Make sure not to return anything, but evaluate the expression + // for side effects. + if (rv) { + emitAnyExpr(rv); + } + } else if (cast(curGD.getDecl()) + ->getReturnType() + ->isReferenceType()) { + // If this function returns a reference, take the address of the + // expression rather than the value. 
+ RValue result = emitReferenceBindingToExpr(rv); + builder.CIRBaseBuilderTy::createStore(loc, result.getValue(), + *fnRetAlloca); + } else { + mlir::Value value = nullptr; + switch (CIRGenFunction::getEvaluationKind(rv->getType())) { + case cir::TEK_Scalar: + value = emitScalarExpr(rv); + if (value) { // Change this to an assert once emitScalarExpr is complete + builder.CIRBaseBuilderTy::createStore(loc, value, *fnRetAlloca); + } + break; + case cir::TEK_Complex: + emitComplexExprIntoLValue(rv, + makeAddrLValue(returnValue, rv->getType()), + /*isInit=*/true); + break; + case cir::TEK_Aggregate: + assert(!cir::MissingFeatures::aggValueSlotGC()); + emitAggExpr(rv, AggValueSlot::forAddr(returnValue, Qualifiers(), + AggValueSlot::IsDestructed, + AggValueSlot::IsNotAliased, + getOverlapForReturnValue())); + break; + } } - } else if (cast(curGD.getDecl()) - ->getReturnType() - ->isReferenceType()) { - // If this function returns a reference, take the address of the - // expression rather than the value. - RValue result = emitReferenceBindingToExpr(rv); - builder.CIRBaseBuilderTy::createStore(loc, result.getValue(), *fnRetAlloca); + }; + + if (!createNewScope) { + handleReturnVal(); } else { - mlir::Value value = nullptr; - switch (CIRGenFunction::getEvaluationKind(rv->getType())) { - case cir::TEK_Scalar: - value = emitScalarExpr(rv); - if (value) { // Change this to an assert once emitScalarExpr is complete - builder.CIRBaseBuilderTy::createStore(loc, value, *fnRetAlloca); - } - break; - case cir::TEK_Complex: - emitComplexExprIntoLValue(rv, makeAddrLValue(returnValue, rv->getType()), - /*isInit=*/true); - break; - case cir::TEK_Aggregate: - assert(!cir::MissingFeatures::aggValueSlotGC()); - emitAggExpr(rv, AggValueSlot::forAddr(returnValue, Qualifiers(), - AggValueSlot::IsDestructed, - AggValueSlot::IsNotAliased, - getOverlapForReturnValue())); - break; + mlir::Location scopeLoc = + getLoc(rv ? rv->getSourceRange() : s.getSourceRange()); + // First create cir.scope and later emit it's body. Otherwise all CIRGen + // dispatched by `handleReturnVal()` might needs to manipulate blocks and + // look into parents, which are all unlinked. + mlir::OpBuilder::InsertPoint scopeBody; + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + scopeBody = b.saveInsertionPoint(); + }); + { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.restoreInsertionPoint(scopeBody); + CIRGenFunction::LexicalScope lexScope{*this, scopeLoc, + builder.getInsertionBlock()}; + handleReturnVal(); } } + cleanupScope.forceCleanup(); + + // In CIR we might have returns in different scopes. + // FIXME(cir): cleanup code is handling actual return emission, the logic + // should try to match traditional codegen more closely (to the extent which + // is possible). auto *retBlock = curLexScope->getOrCreateRetBlock(*this, loc); - // This should emit a branch through the cleanup block if one exists. - builder.create(loc, retBlock); - assert(!cir::MissingFeatures::emitBranchThroughCleanup()); - if (ehStack.stable_begin() != currentCleanupStackDepth) - cgm.errorNYI(s.getSourceRange(), "return with cleanup stack"); + emitBranchThroughCleanup(loc, returnBlock(retBlock)); // Insert the new block to continue codegen after branch to ret block. 
builder.createBlock(builder.getBlock()->getParent()); @@ -1063,5 +1098,5 @@ void CIRGenFunction::emitReturnOfRValue(mlir::Location loc, RValue rv, assert(!cir::MissingFeatures::emitBranchThroughCleanup()); builder.create(loc, retBlock); if (ehStack.stable_begin() != currentCleanupStackDepth) - cgm.errorNYI(loc, "return with cleanup stack"); + cgm.errorNYI(loc, "return of r-value with cleanup stack"); } diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 1d6e671b21b79..4198c23c9cbed 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -18,12 +18,38 @@ #ifndef CLANG_LIB_CIR_CODEGEN_EHSCOPESTACK_H #define CLANG_LIB_CIR_CODEGEN_EHSCOPESTACK_H +#include "clang/CIR/Dialect/IR/CIRDialect.h" #include "llvm/ADT/SmallVector.h" namespace clang::CIRGen { class CIRGenFunction; +/// A branch fixup. These are required when emitting a goto to a +/// label which hasn't been emitted yet. The goto is optimistically +/// emitted as a branch to the basic block for the label, and (if it +/// occurs in a scope with non-trivial cleanups) a fixup is added to +/// the innermost cleanup. When a (normal) cleanup is popped, any +/// unresolved fixups in that scope are threaded through the cleanup. +struct BranchFixup { + /// The block containing the terminator which needs to be modified + /// into a switch if this fixup is resolved into the current scope. + /// If null, LatestBranch points directly to the destination. + mlir::Block *optimisticBranchBlock = nullptr; + + /// The ultimate destination of the branch. + /// + /// This can be set to null to indicate that this fixup was + /// successfully resolved. + mlir::Block *destination = nullptr; + + /// The destination index value. + unsigned destinationIndex = 0; + + /// The initial branch of the fixup. + cir::BrOp initialBranch = {}; +}; + enum CleanupKind : unsigned { /// Denotes a cleanup that should run when a scope is exited using exceptional /// control flow (a throw statement leading to stack unwinding, ). @@ -126,9 +152,31 @@ class EHScopeStack { /// The first valid entry in the buffer. char *startOfData = nullptr; + /// The innermost normal cleanup on the stack. + stable_iterator innermostNormalCleanup = stable_end(); + /// The CGF this Stack belong to CIRGenFunction *cgf = nullptr; + /// The current set of branch fixups. A branch fixup is a jump to + /// an as-yet unemitted label, i.e. a label for which we don't yet + /// know the EH stack depth. Whenever we pop a cleanup, we have + /// to thread all the current branch fixups through it. + /// + /// Fixups are recorded as the Use of the respective branch or + /// switch statement. The use points to the final destination. + /// When popping out of a cleanup, these uses are threaded through + /// the cleanup and adjusted to point to the new cleanup. + /// + /// Note that branches are allowed to jump into protected scopes + /// in certain situations; e.g. the following code is legal: + /// struct A { ~A(); }; // trivial ctor, non-trivial dtor + /// goto foo; + /// A a; + /// foo: + /// bar(); + llvm::SmallVector branchFixups; + // This class uses a custom allocator for maximum efficiency because cleanups // are allocated and freed very frequently. It's basically a bump pointer // allocator, but we can't use LLVM's BumpPtrAllocator because we use offsets @@ -166,6 +214,18 @@ class EHScopeStack { /// Determines whether the exception-scopes stack is empty. 
bool empty() const { return startOfData == endOfBuffer; } + /// Determines whether there are any normal cleanups on the stack. + bool hasNormalCleanups() const { + return innermostNormalCleanup != stable_end(); + } + + /// Returns the innermost normal cleanup on the stack, or + /// stable_end() if there are no normal cleanups. + stable_iterator getInnermostNormalCleanup() const { + return innermostNormalCleanup; + } + stable_iterator getInnermostActiveNormalCleanup() const; + /// An unstable reference to a scope-stack depth. Invalidated by /// pushes but not pops. class iterator; @@ -180,12 +240,30 @@ class EHScopeStack { return stable_iterator(endOfBuffer - startOfData); } + /// Create a stable reference to the bottom of the EH stack. + static stable_iterator stable_end() { return stable_iterator(0); } + /// Turn a stable reference to a scope depth into a unstable pointer /// to the EH stack. iterator find(stable_iterator savePoint) const; - /// Create a stable reference to the bottom of the EH stack. - static stable_iterator stable_end() { return stable_iterator(0); } + /// Add a branch fixup to the current cleanup scope. + BranchFixup &addBranchFixup() { + assert(hasNormalCleanups() && "adding fixup in scope without cleanups"); + branchFixups.push_back(BranchFixup()); + return branchFixups.back(); + } + + unsigned getNumBranchFixups() const { return branchFixups.size(); } + BranchFixup &getBranchFixup(unsigned i) { + assert(i < getNumBranchFixups()); + return branchFixups[i]; + } + + /// Pops lazily-removed fixups from the end of the list. This + /// should only be called by procedures which have just popped a + /// cleanup or resolved one or more fixups. + void popNullFixups(); }; } // namespace clang::CIRGen diff --git a/clang/test/CIR/CodeGen/dtors.cpp b/clang/test/CIR/CodeGen/dtors.cpp index f2c80a547f1d3..1fe048b7d5327 100644 --- a/clang/test/CIR/CodeGen/dtors.cpp +++ b/clang/test/CIR/CodeGen/dtors.cpp @@ -35,7 +35,7 @@ bool make_temp(const B &) { return false; } bool test_temp_or() { return make_temp(1) || make_temp(2); } // CIR: cir.func{{.*}} @_Z12test_temp_orv() -// CIR: %[[SCOPE:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[REF_TMP0:.*]] = cir.alloca !rec_B, !cir.ptr, ["ref.tmp0"] // CIR: %[[ONE:.*]] = cir.const #cir.int<1> // CIR: cir.call @_ZN1BC2Ei(%[[REF_TMP0]], %[[ONE]]) @@ -51,9 +51,9 @@ bool test_temp_or() { return make_temp(1) || make_temp(2); } // CIR: cir.call @_ZN1BD2Ev(%[[REF_TMP1]]) // CIR: cir.yield %[[MAKE_TEMP1]] : !cir.bool // CIR: }) +// CIR: cir.store{{.*}} %[[TERNARY]], %[[RETVAL:.*]] // CIR: cir.call @_ZN1BD2Ev(%[[REF_TMP0]]) -// CIR: cir.yield %[[TERNARY]] : !cir.bool -// CIR: } : !cir.bool +// CIR: } // LLVM: define{{.*}} i1 @_Z12test_temp_orv(){{.*}} { // LLVM: %[[REF_TMP0:.*]] = alloca %struct.B @@ -105,7 +105,7 @@ bool test_temp_or() { return make_temp(1) || make_temp(2); } bool test_temp_and() { return make_temp(1) && make_temp(2); } // CIR: cir.func{{.*}} @_Z13test_temp_andv() -// CIR: %[[SCOPE:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[REF_TMP0:.*]] = cir.alloca !rec_B, !cir.ptr, ["ref.tmp0"] // CIR: %[[ONE:.*]] = cir.const #cir.int<1> // CIR: cir.call @_ZN1BC2Ei(%[[REF_TMP0]], %[[ONE]]) @@ -121,9 +121,9 @@ bool test_temp_and() { return make_temp(1) && make_temp(2); } // CIR: %[[FALSE:.*]] = cir.const #false // CIR: cir.yield %[[FALSE]] : !cir.bool // CIR: }) +// CIR: cir.store{{.*}} %[[TERNARY]], %[[RETVAL:.*]] // CIR: cir.call @_ZN1BD2Ev(%[[REF_TMP0]]) -// CIR: cir.yield %[[TERNARY]] : !cir.bool -// CIR: } : !cir.bool +// CIR: } // 
LLVM: define{{.*}} i1 @_Z13test_temp_andv(){{.*}} { // LLVM: %[[REF_TMP0:.*]] = alloca %struct.B diff --git a/clang/test/CIR/CodeGen/lambda.cpp b/clang/test/CIR/CodeGen/lambda.cpp index 0c32ceb187df4..91380b9bea296 100644 --- a/clang/test/CIR/CodeGen/lambda.cpp +++ b/clang/test/CIR/CodeGen/lambda.cpp @@ -219,14 +219,13 @@ int f() { // CIR: cir.func dso_local @_Z1fv() -> !s32i {{.*}} { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] -// CIR: %[[SCOPE_RET:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[TMP:.*]] = cir.alloca ![[REC_LAM_G2]], !cir.ptr, ["ref.tmp0"] // CIR: %[[G2:.*]] = cir.call @_Z2g2v() : () -> ![[REC_LAM_G2]] // CIR: cir.store{{.*}} %[[G2]], %[[TMP]] // CIR: %[[RESULT:.*]] = cir.call @_ZZ2g2vENK3$_0clEv(%[[TMP]]) -// CIR: cir.yield %[[RESULT]] +// CIR: cir.store{{.*}} %[[RESULT]], %[[RETVAL]] // CIR: } -// CIR: cir.store{{.*}} %[[SCOPE_RET]], %[[RETVAL]] // CIR: %[[RET:.*]] = cir.load{{.*}} %[[RETVAL]] // CIR: cir.return %[[RET]] @@ -255,10 +254,9 @@ int f() { // LLVM: %[[G2:.*]] = call %[[REC_LAM_G2]] @_Z2g2v() // LLVM: store %[[REC_LAM_G2]] %[[G2]], ptr %[[TMP]] // LLVM: %[[RESULT:.*]] = call i32 @"_ZZ2g2vENK3$_0clEv"(ptr %[[TMP]]) +// LLVM: store i32 %[[RESULT]], ptr %[[RETVAL]] // LLVM: br label %[[RET_BB:.*]] // LLVM: [[RET_BB]]: -// LLVM: %[[RETPHI:.*]] = phi i32 [ %[[RESULT]], %[[SCOPE_BB]] ] -// LLVM: store i32 %[[RETPHI]], ptr %[[RETVAL]] // LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RET]] @@ -333,14 +331,13 @@ struct A { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] // CIR: %[[THIS]] = cir.load deref %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr -// CIR: %[[SCOPE_RET:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[LAM_ADDR:.*]] = cir.alloca ![[REC_LAM_A]], !cir.ptr, ["ref.tmp0"] // CIR: %[[STRUCT_A:.*]] = cir.get_member %[[LAM_ADDR]][0] {name = "this"} : !cir.ptr -> !cir.ptr // CIR: cir.call @_ZN1AC1ERKS_(%[[STRUCT_A]], %[[THIS]]){{.*}} : (!cir.ptr, !cir.ptr){{.*}} -> () // CIR: %[[LAM_RET:.*]] = cir.call @_ZZN1A3fooEvENKUlvE_clEv(%[[LAM_ADDR]]) -// CIR: cir.yield %[[LAM_RET]] +// CIR: cir.store{{.*}} %[[LAM_RET]], %[[RETVAL]] // CIR: } -// CIR: cir.store{{.*}} %[[SCOPE_RET]], %[[RETVAL]] // CIR: %[[RET:.*]] = cir.load{{.*}} %[[RETVAL]] // CIR: cir.return %[[RET]] @@ -355,10 +352,9 @@ struct A { // LLVM: %[[STRUCT_A:.*]] = getelementptr %[[REC_LAM_A]], ptr %[[LAM_ALLOCA]], i32 0, i32 0 // LLVM: call void @_ZN1AC1ERKS_(ptr %[[STRUCT_A]], ptr %[[THIS]]) // LLVM: %[[LAM_RET:.*]] = call i32 @_ZZN1A3fooEvENKUlvE_clEv(ptr %[[LAM_ALLOCA]]) +// LLVM: store i32 %[[LAM_RET]], ptr %[[RETVAL]] // LLVM: br label %[[RET_BB:.*]] // LLVM: [[RET_BB]]: -// LLVM: %[[RETPHI:.*]] = phi i32 [ %[[LAM_RET]], %[[SCOPE_BB]] ] -// LLVM: store i32 %[[RETPHI]], ptr %[[RETVAL]] // LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RET]] @@ -407,14 +403,13 @@ struct A { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] // CIR: %[[THIS]] = cir.load %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr -// CIR: %[[SCOPE_RET:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[LAM_ADDR:.*]] = cir.alloca ![[REC_LAM_PTR_A]], !cir.ptr, ["ref.tmp0"] // CIR: %[[A_ADDR_ADDR:.*]] = cir.get_member %[[LAM_ADDR]][0] {name = "this"} : !cir.ptr -> !cir.ptr> // CIR: cir.store{{.*}} %[[THIS]], %[[A_ADDR_ADDR]] // CIR: %[[LAM_RET:.*]] = cir.call @_ZZN1A3barEvENKUlvE_clEv(%[[LAM_ADDR]]) -// CIR: cir.yield %[[LAM_RET]] +// CIR: cir.store{{.*}} 
%[[LAM_RET]], %[[RETVAL]] // CIR: } -// CIR: cir.store{{.*}} %[[SCOPE_RET]], %[[RETVAL]] // CIR: %[[RET:.*]] = cir.load{{.*}} %[[RETVAL]] // CIR: cir.return %[[RET]] @@ -429,10 +424,9 @@ struct A { // LLVM: %[[A_ADDR_ADDR:.*]] = getelementptr %[[REC_LAM_PTR_A]], ptr %[[LAM_ALLOCA]], i32 0, i32 0 // LLVM: store ptr %[[THIS]], ptr %[[A_ADDR_ADDR]] // LLVM: %[[LAM_RET:.*]] = call i32 @_ZZN1A3barEvENKUlvE_clEv(ptr %[[LAM_ALLOCA]]) +// LLVM: store i32 %[[LAM_RET]], ptr %[[RETVAL]] // LLVM: br label %[[RET_BB:.*]] // LLVM: [[RET_BB]]: -// LLVM: %[[RETPHI:.*]] = phi i32 [ %[[LAM_RET]], %[[SCOPE_BB]] ] -// LLVM: store i32 %[[RETPHI]], ptr %[[RETVAL]] // LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RET]] diff --git a/clang/test/CIR/CodeGen/statement-exprs.c b/clang/test/CIR/CodeGen/statement-exprs.c index f6ec9ecd1b67e..c784ec9eda7d8 100644 --- a/clang/test/CIR/CodeGen/statement-exprs.c +++ b/clang/test/CIR/CodeGen/statement-exprs.c @@ -218,7 +218,7 @@ struct S { int x; }; int test3() { return ({ struct S s = {1}; s; }).x; } // CIR: cir.func no_proto dso_local @test3() -> !s32i // CIR: %[[RETVAL:.+]] = cir.alloca !s32i, !cir.ptr, ["__retval"] -// CIR: %[[YIELDVAL:.+]] = cir.scope { +// CIR: cir.scope { // CIR: %[[REF_TMP0:.+]] = cir.alloca !rec_S, !cir.ptr, ["ref.tmp0"] // CIR: %[[TMP:.+]] = cir.alloca !rec_S, !cir.ptr, ["tmp"] // CIR: cir.scope { @@ -230,9 +230,8 @@ int test3() { return ({ struct S s = {1}; s; }).x; } // CIR: } // CIR: %[[GEP_X_TMP:.+]] = cir.get_member %[[REF_TMP0]][0] {name = "x"} : !cir.ptr -> !cir.ptr // CIR: %[[XVAL:.+]] = cir.load {{.*}} %[[GEP_X_TMP]] : !cir.ptr, !s32i -// CIR: cir.yield %[[XVAL]] : !s32i -// CIR: } : !s32i -// CIR: cir.store %[[YIELDVAL]], %[[RETVAL]] : !s32i, !cir.ptr +// CIR: cir.store{{.*}} %[[XVAL]], %[[RETVAL]] : !s32i, !cir.ptr +// CIR: } // CIR: %[[RES:.+]] = cir.load %[[RETVAL]] : !cir.ptr, !s32i // CIR: cir.return %[[RES]] : !s32i @@ -252,10 +251,9 @@ int test3() { return ({ struct S s = {1}; s; }).x; } // LLVM: [[LBL8]]: // LLVM: %[[GEP_VAR1:.+]] = getelementptr %struct.S, ptr %[[VAR1]], i32 0, i32 0 // LLVM: %[[LOAD_X:.+]] = load i32, ptr %[[GEP_VAR1]] +// LLVM: store i32 %[[LOAD_X]], ptr %[[VAR4]] // LLVM: br label %[[LBL11:.+]] // LLVM: [[LBL11]]: -// LLVM: %[[PHI:.+]] = phi i32 [ %[[LOAD_X]], %[[LBL8]] ] -// LLVM: store i32 %[[PHI]], ptr %[[VAR4]] // LLVM: %[[RES:.+]] = load i32, ptr %[[VAR4]] // LLVM: ret i32 %[[RES]] diff --git a/clang/test/CIR/CodeGen/vla.c b/clang/test/CIR/CodeGen/vla.c index b22c704f67198..0af4f838b6287 100644 --- a/clang/test/CIR/CodeGen/vla.c +++ b/clang/test/CIR/CodeGen/vla.c @@ -282,4 +282,61 @@ void f3(unsigned len) { // break; // } // } - \ No newline at end of file + +int f5(unsigned long len) { + int arr[len]; + return arr[2]; +} + +// CIR: cir.func{{.*}} @f5(%[[LEN_ARG:.*]]: !u64i {{.*}}) -> !s32i +// CIR: %[[LEN_ADDR:.*]] = cir.alloca !u64i, !cir.ptr, ["len", init] +// CIR: %[[RET_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["saved_stack"] +// CIR: cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]] +// CIR: %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]] +// CIR: %[[STACK_PTR:.*]] = cir.stacksave +// CIR: cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]] +// CIR: %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr, %[[LEN]] : !u64i, ["arr"] +// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[ARR_2:.*]] = cir.ptr_stride %[[ARR]], %[[TWO]] +// CIR: %[[ARR_VAL:.*]] = cir.load{{.*}} %[[ARR_2]] : !cir.ptr, !s32i +// CIR: 
cir.store{{.*}} %[[ARR_VAL]], %[[RET_ADDR]] : !s32i, !cir.ptr +// CIR: %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]] +// CIR: cir.stackrestore %[[STACK_RESTORE_PTR]] +// CIR: %[[RET_VAL:.*]] = cir.load{{.*}} %[[RET_ADDR]] +// CIR: cir.return %[[RET_VAL]] : !s32i + +// LLVM: define{{.*}} i32 @f5(i64 %[[LEN_ARG:.*]]) +// LLVM: %[[LEN_ADDR:.*]] = alloca i64 +// LLVM: %[[RET_ADDR:.*]] = alloca i32 +// LLVM: %[[SAVED_STACK:.*]] = alloca ptr +// LLVM: store i64 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// LLVM: %[[LEN:.*]] = load i64, ptr %[[LEN_ADDR]] +// LLVM: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// LLVM: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// LLVM: %[[ARR:.*]] = alloca i32, i64 %[[LEN]] +// LLVM: %[[ARR_2:.*]] = getelementptr i32, ptr %[[ARR]], i64 2 +// LLVM: %[[ARR_VAL:.*]] = load i32, ptr %[[ARR_2]] +// LLVM: store i32 %[[ARR_VAL]], ptr %[[RET_ADDR]] +// LLVM: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// LLVM: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) +// LLVM: %[[RET_VAL:.*]] = load i32, ptr %[[RET_ADDR]] +// LLVM: ret i32 %[[RET_VAL]] + +// Note: VLA_EXPR0 below is emitted to capture debug info. + +// OGCG: define{{.*}} i32 @f5(i64 {{.*}} %[[LEN_ARG:.*]]) +// OGCG: %[[LEN_ADDR:.*]] = alloca i64 +// OGCG: %[[SAVED_STACK:.*]] = alloca ptr +// OGCG: %[[VLA_EXPR0:.*]] = alloca i64 +// OGCG: store i64 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// OGCG: %[[LEN:.*]] = load i64, ptr %[[LEN_ADDR]] +// OGCG: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// OGCG: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// OGCG: %[[ARR:.*]] = alloca i32, i64 %[[LEN]] +// OGCG: store i64 %[[LEN]], ptr %[[VLA_EXPR0]] +// OGCG: %[[ARR_2:.*]] = getelementptr inbounds i32, ptr %[[ARR]], i64 2 +// OGCG: %[[ARR_VAL:.*]] = load i32, ptr %[[ARR_2]] +// OGCG: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// OGCG: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) +// OGCG: ret i32 %[[ARR_VAL]] From 75cad42de2462caa1ebbf386361749d5da4d992a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 21 Oct 2025 10:59:13 -1000 Subject: [PATCH 014/530] [flang][cuda] Do not convert fir.declare op with no user (#164492) --- .../Optimizer/Transforms/CUFOpConversion.cpp | 4 ++++ flang/test/Fir/CUDA/cuda-global-addr.mlir | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 759e3a65dd24f..8d00272b09f42 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -454,6 +454,8 @@ struct DeclareOpConversion : public mlir::OpRewritePattern { mlir::LogicalResult matchAndRewrite(fir::DeclareOp op, mlir::PatternRewriter &rewriter) const override { + if (op.getResult().getUsers().empty()) + return success(); if (auto addrOfOp = op.getMemref().getDefiningOp()) { if (auto global = symTab.lookup( addrOfOp.getSymbol().getRootReference().getValue())) { @@ -963,6 +965,8 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase { } target.addDynamicallyLegalOp([&](fir::DeclareOp op) { + if (op.getResult().getUsers().empty()) + return true; if (inDeviceContext(op)) return true; if (auto addrOfOp = op.getMemref().getDefiningOp()) { diff --git a/flang/test/Fir/CUDA/cuda-global-addr.mlir 
b/flang/test/Fir/CUDA/cuda-global-addr.mlir index 3e50c7a51f49c..6f7816c9163cb 100644 --- a/flang/test/Fir/CUDA/cuda-global-addr.mlir +++ b/flang/test/Fir/CUDA/cuda-global-addr.mlir @@ -63,6 +63,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : // We cannot call _FortranACUFGetDeviceAddress on a constant global. // There is no symbol for it and the call would result into an unresolved reference. +// CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "arraysize"} // CHECK-NOT: fir.call {{.*}}GetDeviceAddress // ----- @@ -90,3 +91,22 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} { // CHECK-NOT: fir.call {{.*}}GetDeviceAddress } + +// ----- + +// Check that we do not introduce call to _FortranACUFGetDeviceAddress when the +// value has no user. + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} { + func.func @_QQmain() attributes {fir.bindc_name = "T"} { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@_QMcon2Ezzz) : !fir.ref + %2 = fir.declare %1 {data_attr = #cuf.cuda, uniq_name = "_QMcon2Ezzz"} : (!fir.ref) -> !fir.ref + return + } + fir.global @_QMcon2Ezzz {data_attr = #cuf.cuda} : i32 +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: fir.address_of(@_QMcon2Ezzz) : !fir.ref +// CHECK-NOT: fir.call {{.*}}GetDeviceAddress From 288ef04d2f19c5b0e4ce3cae450c63a365ab11e9 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 21 Oct 2025 14:04:03 -0700 Subject: [PATCH 015/530] [Github] Only look at previous commit for MacOS Premerge (#164483) Otherwise we're looking for a commit that does not exist given the fetch depth. Fixes #164430. --- .github/workflows/premerge.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 951fc16bed215..6303a119750b5 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -193,7 +193,7 @@ jobs: uses: llvm/actions/install-ninja@main - name: Build and Test run: | - source <(git diff --name-only HEAD~2..HEAD | python3 .ci/compute_projects.py) + source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) if [[ "${projects_to_build}" == "" ]]; then echo "No projects to build" From 228a353dc6e69419360e3017bbadda1919803529 Mon Sep 17 00:00:00 2001 From: Ruoyu Zhong Date: Wed, 22 Oct 2025 05:12:28 +0800 Subject: [PATCH 016/530] [clang-format] Fix repeated backslash insertion in macro line comments (#164300) Line comments in preprocessor directives were incorrectly marked as continuing the directive, causing clang-format to add backslashes after them on repeated runs. Backslashes appended after line comments in this way do not continue the PP directive because the following line would also become part of the comment. Fix by unsetting `InPPDirective` in `WhitespaceManager::replaceWhitespace` for line comments in two places: when breaking lines and when formatting tokens on the same line. 
This stops the spurious backslash insertion for both standalone line comments after PP directives and trailing line comments after macro bodies. Fixes https://github.com/llvm/llvm-project/issues/164282. Signed-off-by: Ruoyu Zhong --- clang/lib/Format/ContinuationIndenter.cpp | 11 +++++++---- clang/unittests/Format/FormatTestComments.cpp | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 37c10c66503bb..e5abf833194d4 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -798,9 +798,11 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, } if (!DryRun) { + const bool ContinuePPDirective = + State.Line->InMacroBody && Current.isNot(TT_LineComment); Whitespaces.replaceWhitespace(Current, /*Newlines=*/0, Spaces, State.Column + Spaces + PPColumnCorrection, - /*IsAligned=*/false, State.Line->InMacroBody); + /*IsAligned=*/false, ContinuePPDirective); } // If "BreakBeforeInheritanceComma" mode, don't break within the inheritance @@ -1176,10 +1178,11 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, // about removing empty lines on closing blocks. Special case them here. MaxEmptyLinesToKeep = 1; } - unsigned Newlines = + const unsigned Newlines = std::max(1u, std::min(Current.NewlinesBefore, MaxEmptyLinesToKeep)); - bool ContinuePPDirective = - State.Line->InPPDirective && State.Line->Type != LT_ImportStatement; + const bool ContinuePPDirective = State.Line->InPPDirective && + State.Line->Type != LT_ImportStatement && + Current.isNot(TT_LineComment); Whitespaces.replaceWhitespace(Current, Newlines, State.Column, State.Column, CurrentState.IsAligned, ContinuePPDirective); } diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index fc80bf4024fd9..6b433bb384864 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -839,6 +839,25 @@ TEST_F(FormatTestComments, MultiLineCommentsInDefines) { getLLVMStyleWithColumns(17))); } +TEST_F(FormatTestComments, LineCommentsInMacrosDoNotGetEscapedNewlines) { + FormatStyle Style = getLLVMStyleWithColumns(0); + Style.ReflowComments = FormatStyle::RCS_Never; + verifyFormat("#define FOO (1U) // comment\n" + " // comment", + Style); + + Style.ColumnLimit = 32; + verifyFormat("#define SOME_MACRO(x) x\n" + "#define FOO \\\n" + " SOME_MACRO(1) + \\\n" + " SOME_MACRO(2) // comment\n" + " // comment", + "#define SOME_MACRO(x) x\n" + "#define FOO SOME_MACRO(1) + SOME_MACRO(2) // comment\n" + " // comment", + Style); +} + TEST_F(FormatTestComments, ParsesCommentsAdjacentToPPDirectives) { EXPECT_EQ("namespace {}\n// Test\n#define A", format("namespace {}\n // Test\n#define A")); From d52e78e33a1beff5274e812eb3c38cf9e2a549f0 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 21 Oct 2025 14:13:44 -0700 Subject: [PATCH 017/530] [AMDGPU] Add clamp support to v_add_{max|min}_{i|u}32 (#164489) --- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 +-- llvm/test/CodeGen/AMDGPU/add-max.ll | 18 +++---- llvm/test/CodeGen/AMDGPU/bf16.ll | 26 +++++----- llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 52 ++++++++++++------- llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 52 ++++++++++++------- .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 52 ++++++++++++------- 6 files changed, 122 insertions(+), 86 deletions(-) diff --git 
a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 42ec8ba876a8f..7cce033b30141 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { - defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile>; } defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll index 00c66563572ea..b3a70577b6979 100644 --- a/llvm/test/CodeGen/AMDGPU/add-max.ll +++ b/llvm/test/CodeGen/AMDGPU/add-max.ll @@ -5,7 +5,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_u32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) @@ -16,7 +16,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_u32_svv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, s0, v0, v1 +; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) @@ -27,7 +27,7 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { ; SDAG-LABEL: add_max_u32_ssv: ; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32_e64 v0, s0, s1, v0 +; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0 ; SDAG-NEXT: ; return to shader part epilog ; ; GISEL-LABEL: add_max_u32_ssv: @@ -59,7 +59,7 @@ define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { ; GCN-LABEL: add_max_u32_vsi: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, v0, s0, 4 +; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 4) @@ -70,7 +70,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { ; GCN-LABEL: add_max_u32_svl: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_u32_e64 v0, s0, v0, 0x64 +; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umax.i32(i32 %add, i32 100) @@ -81,7 +81,7 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) { ; SDAG-LABEL: add_max_u32_slv: ; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32_e64 v0, 0x64, s0, v0 +; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0 ; SDAG-NEXT: ; return to shader part epilog 
; ; GISEL-LABEL: add_max_u32_slv: @@ -99,7 +99,7 @@ define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) { define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_i32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_max_i32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.smax.i32(i32 %add, i32 %c) @@ -110,7 +110,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_min_u32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_min_u32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.umin.i32(i32 %add, i32 %c) @@ -121,7 +121,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_min_i32_vvv: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_min_i32_e64 v0, v0, v1, v2 +; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = add i32 %a, %b %max = call i32 @llvm.smin.i32(i32 %add, i32 %c) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 7ee0015f15997..711d57baac15f 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -39137,7 +39137,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_min_u32_e64 v2, v3, -1, v2 +; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 @@ -39487,8 +39487,8 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4 ; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32_e64 v5, v7, -1, v5 -; GFX1250-NEXT: v_add_min_u32_e64 v4, v6, -1, v4 +; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5 +; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] ; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] @@ -39979,9 +39979,9 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v7, v10, -1, v7 +; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v6, v9, -1, v6 +; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6 ; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] @@ -39991,7 +39991,7 @@ define <3 x bfloat> 
@v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_add_min_u32_e64 v8, v11, -1, v8 +; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8 ; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 @@ -40027,8 +40027,8 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7 ; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v6, v10, -1, v6 -; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v7, v11, -1, v7 +; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6 +; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] ; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1] @@ -40038,7 +40038,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX1250FAKE16-NEXT: v_add_min_u32_e64 v8, v9, -1, v8 +; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 ; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 @@ -40656,18 +40656,18 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14 -; GFX1250-NEXT: v_add_min_u32_e64 v9, v13, -1, v9 +; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_min_u32_e64 v8, v12, -1, v8 +; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8 ; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5] ; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11 -; GFX1250-NEXT: v_add_min_u32_e64 v10, v14, -1, v10 +; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32_e64 v11, v15, -1, v11 +; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3] ; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 ; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index 
d3b44eb788444..8160544ddea4d 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -218,64 +218,76 @@ v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp // GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04] v_add_min_i32 v2, s4, v7, v8 -// GFX1250: v_add_min_i32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_min_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0e,0x22,0x04] v_add_min_i32 v2, v4, 0, 1 -// GFX1250: v_add_min_i32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_min_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x01,0x05,0x02] v_add_min_i32 v2, v4, 3, s2 -// GFX1250: v_add_min_i32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_min_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x07,0x09,0x00] v_add_min_i32 v2, s4, 4, v2 -// GFX1250: v_add_min_i32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_min_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x08,0x09,0x04] v_add_min_i32 v2, v4, v7, 12345 -// GFX1250: v_add_min_i32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_min_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_min_i32 v0, v1, v2, v3 clamp +// GFX1250: v_add_min_i32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x60,0xd6,0x01,0x05,0x0e,0x04] v_add_max_i32 v2, s4, v7, v8 -// GFX1250: v_add_max_i32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_max_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0e,0x22,0x04] v_add_max_i32 v2, v4, 0, 1 -// GFX1250: v_add_max_i32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_max_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x01,0x05,0x02] v_add_max_i32 v2, v4, 3, s2 -// GFX1250: v_add_max_i32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_max_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x07,0x09,0x00] v_add_max_i32 v2, s4, 4, v2 -// GFX1250: v_add_max_i32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_max_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x08,0x09,0x04] v_add_max_i32 v2, v4, v7, 12345 -// GFX1250: v_add_max_i32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_max_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_max_i32 v0, v1, v2, v3 clamp +// GFX1250: v_add_max_i32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x5e,0xd6,0x01,0x05,0x0e,0x04] v_add_min_u32 v2, s4, v7, v8 -// GFX1250: v_add_min_u32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_min_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0e,0x22,0x04] v_add_min_u32 v2, v4, 0, 1 -// GFX1250: v_add_min_u32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_min_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x01,0x05,0x02] v_add_min_u32 v2, v4, 3, s2 -// GFX1250: v_add_min_u32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_min_u32 v2, v4, 3, s2 ; encoding: 
[0x02,0x00,0x61,0xd6,0x04,0x07,0x09,0x00] v_add_min_u32 v2, s4, 4, v2 -// GFX1250: v_add_min_u32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_min_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x08,0x09,0x04] v_add_min_u32 v2, v4, v7, 12345 -// GFX1250: v_add_min_u32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_min_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_min_u32 v0, v1, v2, v3 clamp +// GFX1250: v_add_min_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x61,0xd6,0x01,0x05,0x0e,0x04] v_add_max_u32 v2, s4, v7, v8 -// GFX1250: v_add_max_u32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_max_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0e,0x22,0x04] v_add_max_u32 v2, v4, 0, 1 -// GFX1250: v_add_max_u32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_max_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x01,0x05,0x02] v_add_max_u32 v2, v4, 3, s2 -// GFX1250: v_add_max_u32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_max_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x07,0x09,0x00] v_add_max_u32 v2, s4, 4, v2 -// GFX1250: v_add_max_u32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_max_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x08,0x09,0x04] v_add_max_u32 v2, v4, v7, 12345 -// GFX1250: v_add_max_u32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_max_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_max_u32 v0, v1, v2, v3 clamp +// GFX1250: v_add_max_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x5f,0xd6,0x01,0x05,0x0e,0x04] v_cvt_pk_bf16_f32 v5, v1, v2 // GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index 98d07ac1ece27..d913bd2db504b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -218,64 +218,76 @@ v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp // GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04] v_add_min_i32 v2, s4, v7, v8 -// GFX1250: v_add_min_i32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_min_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0e,0x22,0x04] v_add_min_i32 v2, v4, 0, 1 -// GFX1250: v_add_min_i32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_min_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x01,0x05,0x02] v_add_min_i32 v2, v4, 3, s2 -// GFX1250: v_add_min_i32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_min_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x07,0x09,0x00] v_add_min_i32 v2, s4, 4, v2 -// GFX1250: v_add_min_i32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_min_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x08,0x09,0x04] v_add_min_i32 v2, v4, v7, 12345 -// GFX1250: v_add_min_i32_e64 v2, v4, v7, 0x3039 ; encoding: 
[0x02,0x00,0x60,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_min_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_min_i32 v0, v1, v2, v3 clamp +// GFX1250: v_add_min_i32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x60,0xd6,0x01,0x05,0x0e,0x04] v_add_max_i32 v2, s4, v7, v8 -// GFX1250: v_add_max_i32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_max_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0e,0x22,0x04] v_add_max_i32 v2, v4, 0, 1 -// GFX1250: v_add_max_i32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_max_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x01,0x05,0x02] v_add_max_i32 v2, v4, 3, s2 -// GFX1250: v_add_max_i32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_max_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x07,0x09,0x00] v_add_max_i32 v2, s4, 4, v2 -// GFX1250: v_add_max_i32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_max_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x08,0x09,0x04] v_add_max_i32 v2, v4, v7, 12345 -// GFX1250: v_add_max_i32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_max_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_max_u32 v0, v1, v2, v3 clamp +// GFX1250: v_add_max_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x5f,0xd6,0x01,0x05,0x0e,0x04] v_add_min_u32 v2, s4, v7, v8 -// GFX1250: v_add_min_u32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_min_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0e,0x22,0x04] v_add_min_u32 v2, v4, 0, 1 -// GFX1250: v_add_min_u32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_min_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x01,0x05,0x02] v_add_min_u32 v2, v4, 3, s2 -// GFX1250: v_add_min_u32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_min_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x07,0x09,0x00] v_add_min_u32 v2, s4, 4, v2 -// GFX1250: v_add_min_u32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_min_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x08,0x09,0x04] v_add_min_u32 v2, v4, v7, 12345 -// GFX1250: v_add_min_u32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_min_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_min_u32 v0, v1, v2, v3 clamp +// GFX1250: v_add_min_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x61,0xd6,0x01,0x05,0x0e,0x04] v_add_max_u32 v2, s4, v7, v8 -// GFX1250: v_add_max_u32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0e,0x22,0x04] +// GFX1250: v_add_max_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0e,0x22,0x04] v_add_max_u32 v2, v4, 0, 1 -// GFX1250: v_add_max_u32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x01,0x05,0x02] +// GFX1250: v_add_max_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x01,0x05,0x02] v_add_max_u32 v2, v4, 3, s2 -// GFX1250: v_add_max_u32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x07,0x09,0x00] +// GFX1250: v_add_max_u32 v2, v4, 3, s2 ; encoding: 
[0x02,0x00,0x5f,0xd6,0x04,0x07,0x09,0x00] v_add_max_u32 v2, s4, 4, v2 -// GFX1250: v_add_max_u32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x08,0x09,0x04] +// GFX1250: v_add_max_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x08,0x09,0x04] v_add_max_u32 v2, v4, v7, 12345 -// GFX1250: v_add_max_u32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +// GFX1250: v_add_max_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_add_max_u32 v0, v1, v2, v3 clamp +// GFX1250: v_add_max_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x5f,0xd6,0x01,0x05,0x0e,0x04] v_cvt_pk_bf16_f32 v5, v1, v2 // GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index 29bfa54f2c10d..7af0bfe5b5327 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -237,64 +237,76 @@ # GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04] 0x02,0x00,0x60,0xd6,0x04,0x08,0x09,0x04 -# GFX1250: v_add_min_i32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x08,0x09,0x04] +# GFX1250: v_add_min_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x08,0x09,0x04] 0x02,0x00,0x60,0xd6,0x04,0x0e,0x22,0x04 -# GFX1250: v_add_min_i32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0e,0x22,0x04] +# GFX1250: v_add_min_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0e,0x22,0x04] 0x02,0x00,0x60,0xd6,0x04,0x01,0x05,0x02 -# GFX1250: v_add_min_i32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x01,0x05,0x02] +# GFX1250: v_add_min_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x01,0x05,0x02] 0x02,0x00,0x60,0xd6,0x04,0x07,0x09,0x00 -# GFX1250: v_add_min_i32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x07,0x09,0x00] +# GFX1250: v_add_min_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x07,0x09,0x00] 0x02,0x00,0x60,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 -# GFX1250: v_add_min_i32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +# GFX1250: v_add_min_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x60,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +0x00,0x80,0x5f,0xd6,0x01,0x05,0x0e,0x04 +# GFX1250: v_add_max_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x5f,0xd6,0x01,0x05,0x0e,0x04] + +0x00,0x80,0x60,0xd6,0x01,0x05,0x0e,0x04 +# GFX1250: v_add_min_i32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x60,0xd6,0x01,0x05,0x0e,0x04] 0x02,0x00,0x5e,0xd6,0x04,0x08,0x09,0x04 -# GFX1250: v_add_max_i32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x08,0x09,0x04] +# GFX1250: v_add_max_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x08,0x09,0x04] 0x02,0x00,0x5e,0xd6,0x04,0x0e,0x22,0x04 -# GFX1250: v_add_max_i32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0e,0x22,0x04] +# GFX1250: v_add_max_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0e,0x22,0x04] 0x02,0x00,0x5e,0xd6,0x04,0x01,0x05,0x02 -# GFX1250: v_add_max_i32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x01,0x05,0x02] +# GFX1250: v_add_max_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x01,0x05,0x02] 0x02,0x00,0x5e,0xd6,0x04,0x07,0x09,0x00 -# GFX1250: v_add_max_i32_e64 v2, v4, 3, s2 ; encoding: 
[0x02,0x00,0x5e,0xd6,0x04,0x07,0x09,0x00] +# GFX1250: v_add_max_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x07,0x09,0x00] 0x02,0x00,0x5e,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 -# GFX1250: v_add_max_i32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +# GFX1250: v_add_max_i32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5e,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] 0x02,0x00,0x61,0xd6,0x04,0x08,0x09,0x04 -# GFX1250: v_add_min_u32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x08,0x09,0x04] +# GFX1250: v_add_min_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x08,0x09,0x04] 0x02,0x00,0x61,0xd6,0x04,0x0e,0x22,0x04 -# GFX1250: v_add_min_u32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0e,0x22,0x04] +# GFX1250: v_add_min_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0e,0x22,0x04] 0x02,0x00,0x61,0xd6,0x04,0x01,0x05,0x02 -# GFX1250: v_add_min_u32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x01,0x05,0x02] +# GFX1250: v_add_min_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x01,0x05,0x02] 0x02,0x00,0x61,0xd6,0x04,0x07,0x09,0x00 -# GFX1250: v_add_min_u32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x07,0x09,0x00] +# GFX1250: v_add_min_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x07,0x09,0x00] 0x02,0x00,0x61,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 -# GFX1250: v_add_min_u32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +# GFX1250: v_add_min_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x61,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +0x00,0x80,0x61,0xd6,0x01,0x05,0x0e,0x04 +# GFX1250: v_add_min_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x61,0xd6,0x01,0x05,0x0e,0x04] 0x02,0x00,0x5f,0xd6,0x04,0x08,0x09,0x04 -# GFX1250: v_add_max_u32_e64 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x08,0x09,0x04] +# GFX1250: v_add_max_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x08,0x09,0x04] 0x02,0x00,0x5f,0xd6,0x04,0x0e,0x22,0x04 -# GFX1250: v_add_max_u32_e64 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0e,0x22,0x04] +# GFX1250: v_add_max_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0e,0x22,0x04] 0x02,0x00,0x5f,0xd6,0x04,0x01,0x05,0x02 -# GFX1250: v_add_max_u32_e64 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x01,0x05,0x02] +# GFX1250: v_add_max_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x01,0x05,0x02] 0x02,0x00,0x5f,0xd6,0x04,0x07,0x09,0x00 -# GFX1250: v_add_max_u32_e64 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x07,0x09,0x00] +# GFX1250: v_add_max_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x07,0x09,0x00] 0x02,0x00,0x5f,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 -# GFX1250: v_add_max_u32_e64 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +# GFX1250: v_add_max_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x5f,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +0x00,0x80,0x5f,0xd6,0x01,0x05,0x0e,0x04 +# GFX1250: v_add_max_u32 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0x5f,0xd6,0x01,0x05,0x0e,0x04] 0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf # GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf] From 0e8ee0ec78dc370a1bf2688411cf2db36c3a4cd0 Mon Sep 17 00:00:00 2001 From: Rahman Lavaee Date: Tue, 21 Oct 2025 14:15:03 -0700 Subject: [PATCH 018/530] Remove 
unnecessary static_cast in AsmPrinter.cpp. --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index e2af0c5925248..a114406b32fb6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1438,7 +1438,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, BBFreqEnabled, BrProbEnabled, MF.hasBBSections() && NumMBBSectionRanges > 1, - static_cast(BBAddrMapSkipEmitBBEntries), + BBAddrMapSkipEmitBBEntries, HasCalls, false}; } From 073335d056a39d273df4f395dd9cbad58657e207 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Tue, 21 Oct 2025 14:19:48 -0700 Subject: [PATCH 019/530] [CAS] Fix msan failure attempt 2 (#164493) Try to fix msan error again. Previously, the error wasn't correctly identified as the uninitialized value is in a different function than the line crashed. Make sure value return from hash generating function is fully zero initialized. --- llvm/unittests/CAS/OnDiskCommonUtils.h | 2 +- llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h index 57c8c228867fd..89f93e08366c9 100644 --- a/llvm/unittests/CAS/OnDiskCommonUtils.h +++ b/llvm/unittests/CAS/OnDiskCommonUtils.h @@ -45,7 +45,7 @@ inline HashType digest(StringRef Data) { } inline ValueType valueFromString(StringRef S) { - ValueType Val; + ValueType Val = {}; llvm::copy(S.substr(0, sizeof(Val)), Val.data()); return Val; } diff --git a/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp index 19ea8f5d30b71..41512d04b60bd 100644 --- a/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp +++ b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp @@ -65,7 +65,7 @@ TEST_F(OnDiskCASTest, OnDiskKeyValueDBTest) { // Insert a lot of entries. for (unsigned I = 0; I < 1024 * 100; ++I) { std::string Index = Twine(I).str(); - ArrayRef Val; + std::optional> Val; ASSERT_THAT_ERROR( DB->put(digest(Index), valueFromString(Index)).moveInto(Val), Succeeded()); From 5129b37254f0c6e7aac94861e552156e920e2e6c Mon Sep 17 00:00:00 2001 From: Siavash Nazari Date: Tue, 21 Oct 2025 23:22:40 +0200 Subject: [PATCH 020/530] [MLIR][Python] Add shard Dialect Python Bindings (#162578) Add Python bindings for `shard` dialect. Provide means for creating constructs in this dialect in Python. 
--- mlir/python/CMakeLists.txt | 9 +++ mlir/python/mlir/dialects/ShardOps.td | 14 ++++ mlir/python/mlir/dialects/shard.py | 6 ++ mlir/test/python/dialects/shard.py | 67 +++++++++++++++++++ .../mlir/python/BUILD.bazel | 32 +++++++++ 5 files changed, 128 insertions(+) create mode 100644 mlir/python/mlir/dialects/ShardOps.td create mode 100644 mlir/python/mlir/dialects/shard.py create mode 100644 mlir/test/python/dialects/shard.py diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index 84ff08a7631d6..e79b51ad72e2b 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -346,6 +346,15 @@ declare_mlir_dialect_python_bindings( dialects/memref.py DIALECT_NAME memref) +declare_mlir_dialect_python_bindings( + ADD_TO_PARENT MLIRPythonSources.Dialects + ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" + TD_FILE dialects/ShardOps.td + SOURCES + dialects/shard.py + DIALECT_NAME shard + GEN_ENUM_BINDINGS) + declare_mlir_dialect_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" diff --git a/mlir/python/mlir/dialects/ShardOps.td b/mlir/python/mlir/dialects/ShardOps.td new file mode 100644 index 0000000000000..f8527664df67b --- /dev/null +++ b/mlir/python/mlir/dialects/ShardOps.td @@ -0,0 +1,14 @@ +//===-- ShardOps.td - Entry point for ShardOps bindings ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef PYTHON_BINDINGS_SHARD_OPS +#define PYTHON_BINDINGS_SHARD_OPS + +include "mlir/Dialect/Shard/IR/ShardOps.td" + +#endif diff --git a/mlir/python/mlir/dialects/shard.py b/mlir/python/mlir/dialects/shard.py new file mode 100644 index 0000000000000..8d69f17954290 --- /dev/null +++ b/mlir/python/mlir/dialects/shard.py @@ -0,0 +1,6 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from ._shard_ops_gen import * +from ._shard_enum_gen import * diff --git a/mlir/test/python/dialects/shard.py b/mlir/test/python/dialects/shard.py new file mode 100644 index 0000000000000..c3ba605296345 --- /dev/null +++ b/mlir/test/python/dialects/shard.py @@ -0,0 +1,67 @@ +# RUN: %PYTHON %s | FileCheck %s + +from mlir.ir import * +from mlir.dialects import shard +from mlir.dialects import func + + +def constructAndPrintInModule(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + f() + print(module) + module.operation.verify() + return f + + +# CHECK-LABEL: TEST: testShardGrid +@constructAndPrintInModule +def testShardGrid(): + # Test creating shard grids with different shapes + grid2d = shard.GridOp("grid_2d", [2, 2]) + grid1d = shard.GridOp("grid_1d", [4]) + + # CHECK: shard.grid @grid_2d(shape = 2x2) + # CHECK: shard.grid @grid_1d(shape = 4) + + +# CHECK-LABEL: TEST: testCollectiveOperations +@constructAndPrintInModule +def testCollectiveOperations(): + # Create grid and types + grid_op = shard.GridOp("grid_2x2", [2, 2]) + i32 = IntegerType.get_signless(32) + index_type = IndexType.get() + input_type = RankedTensorType.get([4, 2], i32) + gather_result_type = RankedTensorType.get([4, 4], i32) + + # Create a function to hold the operations + func_type = FunctionType.get([input_type], [input_type]) + test_func = func.FuncOp("test_collectives", func_type) + + with InsertionPoint(test_func.add_entry_block()): + arg = test_func.entry_block.arguments[0] + + gather_op = shard.AllGatherOp( + input=arg, + grid=FlatSymbolRefAttr.get("grid_2x2"), + grid_axes=DenseI16ArrayAttr.get([1]), + gather_axis=IntegerAttr.get(index_type, 1), + result=gather_result_type, + ) + + reduce_op = shard.AllReduceOp( + input=arg, + grid=FlatSymbolRefAttr.get("grid_2x2"), + reduction=shard.ReductionKind.Sum, + result=input_type, + ) + + func.ReturnOp([reduce_op]) + + # CHECK: shard.grid @grid_2x2(shape = 2x2) + # CHECK: func.func @test_collectives(%arg0: tensor<4x2xi32>) -> tensor<4x2xi32> + # CHECK: %all_gather = shard.all_gather %arg0 on @grid_2x2 grid_axes = [1] gather_axis = 1 : tensor<4x2xi32> -> tensor<4x4xi32> + # CHECK: %all_reduce = shard.all_reduce %arg0 on @grid_2x2 : tensor<4x2xi32> -> tensor<4x2xi32> diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index 0a84bd246b8e0..43613ad507070 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -1009,6 +1009,38 @@ filegroup( ], ) +##---------------------------------------------------------------------------## +# Shard dialect. +##---------------------------------------------------------------------------## + +gentbl_filegroup( + name = "ShardOpsPyGen", + tbl_outs = { + "mlir/dialects/_shard_enum_gen.py": [ + "-gen-python-enum-bindings", + "-bind-dialect=shard", + ], + "mlir/dialects/_shard_ops_gen.py": [ + "-gen-python-op-bindings", + "-bind-dialect=shard", + ], + }, + tblgen = "//mlir:mlir-tblgen", + td_file = "mlir/dialects/ShardOps.td", + deps = [ + "//mlir:OpBaseTdFiles", + "//mlir:ShardTdFiles", + ], +) + +filegroup( + name = "ShardOpsPyFiles", + srcs = [ + "mlir/dialects/shard.py", + ":ShardOpsPyGen", + ], +) + ##---------------------------------------------------------------------------## # Shape dialect. 
##---------------------------------------------------------------------------## From 841e1e1e17c3bd83e9eaa9e10a057b8217eb8de3 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 21 Oct 2025 14:47:12 -0700 Subject: [PATCH 021/530] [lldb-dap] Send a 'process' event on restart. (#163833) When we restart a process, send an updated 'process' event describing the newly launched process. I also updated the `isLocalProcess` value based on if we're on the 'host' platform or not. --- .../bindings/interface/SBPlatformExtensions.i | 7 ++++ lldb/bindings/interfaces.swig | 1 + lldb/include/lldb/API/SBPlatform.h | 3 ++ lldb/source/API/SBPlatform.cpp | 5 +++ lldb/tools/lldb-dap/EventHelper.cpp | 40 +++++++++++-------- .../Handler/RestartRequestHandler.cpp | 2 + 6 files changed, 41 insertions(+), 17 deletions(-) create mode 100644 lldb/bindings/interface/SBPlatformExtensions.i diff --git a/lldb/bindings/interface/SBPlatformExtensions.i b/lldb/bindings/interface/SBPlatformExtensions.i new file mode 100644 index 0000000000000..86759bda50630 --- /dev/null +++ b/lldb/bindings/interface/SBPlatformExtensions.i @@ -0,0 +1,7 @@ +%extend lldb::SBPlatform { +#ifdef SWIGPYTHON + %pythoncode %{ + is_host = property(IsHost, None, doc='''A read only property that returns a boolean value that indiciates if this platform is the host platform.''') + %} +#endif +} diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index e71ed136f20e6..b3d44979c916c 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -53,6 +53,7 @@ %include "./interface/SBModuleSpecDocstrings.i" %include "./interface/SBMutexExtensions.i" %include "./interface/SBPlatformDocstrings.i" +%include "./interface/SBPlatformExtensions.i" %include "./interface/SBProcessDocstrings.i" %include "./interface/SBProcessInfoDocstrings.i" %include "./interface/SBProgressDocstrings.i" diff --git a/lldb/include/lldb/API/SBPlatform.h b/lldb/include/lldb/API/SBPlatform.h index d63d2ed1eaba6..7cb19b3f7ec46 100644 --- a/lldb/include/lldb/API/SBPlatform.h +++ b/lldb/include/lldb/API/SBPlatform.h @@ -112,6 +112,9 @@ class LLDB_API SBPlatform { bool IsValid() const; + /// Returns true if this platform is the host platform, otherwise false. + bool IsHost() const; + void Clear(); const char *GetWorkingDirectory(); diff --git a/lldb/source/API/SBPlatform.cpp b/lldb/source/API/SBPlatform.cpp index ec59e27a08b30..9a0b47ce80336 100644 --- a/lldb/source/API/SBPlatform.cpp +++ b/lldb/source/API/SBPlatform.cpp @@ -331,6 +331,11 @@ SBPlatform::operator bool() const { return m_opaque_sp.get() != nullptr; } +bool SBPlatform::IsHost() const { + LLDB_INSTRUMENT_VA(this); + return m_opaque_sp.get() != nullptr && m_opaque_sp->IsHost(); +} + void SBPlatform::Clear() { LLDB_INSTRUMENT_VA(this); diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp index 3042d3293b482..c5d5f2bb59b42 100644 --- a/lldb/tools/lldb-dap/EventHelper.cpp +++ b/lldb/tools/lldb-dap/EventHelper.cpp @@ -15,6 +15,7 @@ #include "Protocol/ProtocolRequests.h" #include "Protocol/ProtocolTypes.h" #include "lldb/API/SBFileSpec.h" +#include "lldb/API/SBPlatform.h" #include "llvm/Support/Error.h" #include @@ -78,11 +79,9 @@ void SendExtraCapabilities(DAP &dap) { // { "$ref": "#/definitions/Event" }, // { // "type": "object", -// "description": "Event message for 'process' event type. The event -// indicates that the debugger has begun debugging a -// new process. 
Either one that it has launched, or one -// that it has attached to.", -// "properties": { +// "description": "The event indicates that the debugger has begun +// debugging a new process. Either one that it has launched, or one that +// it has attached to.", "properties": { // "event": { // "type": "string", // "enum": [ "process" ] @@ -93,31 +92,37 @@ void SendExtraCapabilities(DAP &dap) { // "name": { // "type": "string", // "description": "The logical name of the process. This is -// usually the full path to process's executable -// file. Example: /home/myproj/program.js." +// usually the full path to process's executable file. Example: +// /home/example/myproj/program.js." // }, // "systemProcessId": { // "type": "integer", -// "description": "The system process id of the debugged process. -// This property will be missing for non-system -// processes." +// "description": "The process ID of the debugged process, as +// assigned by the operating system. This property should be +// omitted for logical processes that do not map to operating +// system processes on the machine." // }, // "isLocalProcess": { // "type": "boolean", // "description": "If true, the process is running on the same -// computer as the debug adapter." +// computer as the debug adapter." // }, // "startMethod": { // "type": "string", // "enum": [ "launch", "attach", "attachForSuspendedLaunch" ], // "description": "Describes how the debug engine started -// debugging this process.", -// "enumDescriptions": [ +// debugging this process.", "enumDescriptions": [ // "Process was launched under the debugger.", // "Debugger attached to an existing process.", -// "A project launcher component has launched a new process in -// a suspended state and then asked the debugger to attach." +// "A project launcher component has launched a new process in a +// suspended state and then asked the debugger to attach." // ] +// }, +// "pointerSize": { +// "type": "integer", +// "description": "The size of a pointer or address for this +// process, in bits. This value may be used by clients when +// formatting addresses for display." // } // }, // "required": [ "name" ] @@ -126,7 +131,7 @@ void SendExtraCapabilities(DAP &dap) { // "required": [ "event", "body" ] // } // ] -// } +// }, void SendProcessEvent(DAP &dap, LaunchMethod launch_method) { lldb::SBFileSpec exe_fspec = dap.target.GetExecutable(); char exe_path[PATH_MAX]; @@ -136,7 +141,8 @@ void SendProcessEvent(DAP &dap, LaunchMethod launch_method) { EmplaceSafeString(body, "name", exe_path); const auto pid = dap.target.GetProcess().GetProcessID(); body.try_emplace("systemProcessId", (int64_t)pid); - body.try_emplace("isLocalProcess", true); + body.try_emplace("isLocalProcess", dap.target.GetPlatform().IsHost()); + body.try_emplace("pointerSize", dap.target.GetAddressByteSize() * 8); const char *startMethod = nullptr; switch (launch_method) { case Launch: diff --git a/lldb/tools/lldb-dap/Handler/RestartRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/RestartRequestHandler.cpp index 45dd7ddce0428..100173bfc3082 100644 --- a/lldb/tools/lldb-dap/Handler/RestartRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/RestartRequestHandler.cpp @@ -124,6 +124,8 @@ void RestartRequestHandler::operator()( return; } + SendProcessEvent(dap, Launch); + // This is normally done after receiving a "configuration done" request. // Because we're restarting, configuration has already happened so we can // continue the process right away. 
From 3728ac7dae5bfd92a2c7f8b27fd0748e913cbde4 Mon Sep 17 00:00:00 2001 From: David Justo Date: Tue, 21 Oct 2025 15:07:57 -0700 Subject: [PATCH 022/530] [Docs] [Developer Policy] Document best practice of not tagging a username in commit messages and PR descriptions (#164328) Related to: https://discourse.llvm.org/t/forbidding-username-in-commits/86997 **Context:** When we merge a commit including tag to another username (e.g. `@`), that account will receive an email / notification every time that PR is cherry-picked and pushed to a fork, generating _a lot_ of spam in the user's inbox. As of today, there's no way of disabling this on GitHub settings. **This PR** documents this error in our developer policy, and reminds contributors to avoid it. **Next steps:** I'm a big believer that any policy that is not enforced via automation is not enforced at all, so I'd love to see some kind of check to prevent this error. However, there does not seem to be an agreement to do that _yet_. I'll see if I can gather support for doing that. --------- Co-authored-by: Jakub Kuderski --- llvm/docs/DeveloperPolicy.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst index b94e79e5e7bd5..45f2df20984e6 100644 --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -413,6 +413,10 @@ Below are some guidelines about the format of the message itself: message self-explanatory. Note that such non-public links should not be included in the submitted code. +* Avoid 'tagging' someone's username in your commits and PR descriptions + (e.g., `@`), doing so results in that account receiving a notification + every time the commit is cherry-picked and/or pushed to a fork. + LLVM uses a squash workflow for pull requests, so as the pull request evolves during review, it's important to update the pull request description over the course of a review. GitHub uses the initial commit message to create the pull From 4cb91e79b6db916c47eacc839cbda6769e876aaa Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Tue, 21 Oct 2025 18:18:44 -0400 Subject: [PATCH 023/530] [flang][OpenACC] Relax COMMON block usage restriction in OpenACC directives (#162659) Unlike OpenMP, OpenACC doesn't require that the COMMON block be defined in the same scope as the directive. 
--- flang/include/flang/Semantics/scope.h | 23 +++++++-- flang/lib/Semantics/resolve-directives.cpp | 21 ++++----- flang/lib/Semantics/resolve-names.cpp | 14 ++++++ flang/lib/Semantics/scope.cpp | 52 ++++++++++++++++++--- flang/test/Semantics/OpenACC/acc-common.f90 | 41 ++++++++++++++++ 5 files changed, 129 insertions(+), 22 deletions(-) create mode 100644 flang/test/Semantics/OpenACC/acc-common.f90 diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h index 3195892fa7b91..ecffdb468bf6c 100644 --- a/flang/include/flang/Semantics/scope.h +++ b/flang/include/flang/Semantics/scope.h @@ -86,6 +86,13 @@ class Scope { CHECK(parent_ != this); return *parent_; } + + mapType &commonBlocks() { return commonBlocks_; } + const mapType &commonBlocks() const { return commonBlocks_; } + + mapType &commonBlockUses() { return commonBlockUses_; } + const mapType &commonBlockUses() const { return commonBlockUses_; } + Kind kind() const { return kind_; } bool IsGlobal() const { return kind_ == Kind::Global; } bool IsIntrinsicModules() const { return kind_ == Kind::IntrinsicModules; } @@ -186,10 +193,19 @@ class Scope { // Cray pointers are saved as map of pointee name -> pointer symbol const mapType &crayPointers() const { return crayPointers_; } void add_crayPointer(const SourceName &, Symbol &); - mapType &commonBlocks() { return commonBlocks_; } - const mapType &commonBlocks() const { return commonBlocks_; } Symbol &MakeCommonBlock(SourceName, SourceName location); - Symbol *FindCommonBlock(const SourceName &) const; + bool AddCommonBlockUse( + const SourceName &name, Attrs attrs, Symbol &cbUltimate); + + // Find COMMON block that is declared in the current scope + Symbol *FindCommonBlock(const SourceName &name) const; + + // Find USE-associated COMMON block in the current scope + Symbol *FindCommonBlockUse(const SourceName &name) const; + + // Find COMMON block in current and surrounding scopes, follow USE + // associations + Symbol *FindCommonBlockInVisibleScopes(const SourceName &) const; /// Make a Symbol but don't add it to the scope. template @@ -283,6 +299,7 @@ class Scope { std::list children_; mapType symbols_; mapType commonBlocks_; + mapType commonBlockUses_; // USE-assocated COMMON blocks std::list equivalenceSets_; mapType crayPointers_; std::map> submodules_; diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index b0c36eced54c8..c410bd48b352c 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -1723,17 +1723,12 @@ void AccAttributeVisitor::Post(const parser::Name &name) { Symbol *AccAttributeVisitor::ResolveAccCommonBlockName( const parser::Name *name) { - if (auto *prev{name - ? GetContext().scope.parent().FindCommonBlock(name->source) - : nullptr}) { - name->symbol = prev; - return prev; - } - // Check if the Common Block is declared in the current scope - if (auto *commonBlockSymbol{ - name ? 
GetContext().scope.FindCommonBlock(name->source) : nullptr}) { - name->symbol = commonBlockSymbol; - return commonBlockSymbol; + if (name) { + if (Symbol * + cb{GetContext().scope.FindCommonBlockInVisibleScopes(name->source)}) { + name->symbol = cb; + return cb; + } } return nullptr; } @@ -1783,8 +1778,8 @@ void AccAttributeVisitor::ResolveAccObject( } } else { context_.Say(name.source, - "COMMON block must be declared in the same scoping unit " - "in which the OpenACC directive or clause appears"_err_en_US); + "Could not find COMMON block '%s' used in OpenACC directive"_err_en_US, + name.ToString()); } }, }, diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 88cc4469034e7..db75437708a6c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -3645,6 +3645,20 @@ void ModuleVisitor::Post(const parser::UseStmt &x) { } } } + // Go through the list of COMMON block symbols in the module scope and add + // their USE association to the current scope's USE-associated COMMON blocks. + for (const auto &[name, symbol] : useModuleScope_->commonBlocks()) { + if (!currScope().FindCommonBlockInVisibleScopes(name)) { + currScope().AddCommonBlockUse( + name, symbol->attrs(), symbol->GetUltimate()); + } + } + // Go through the list of USE-associated COMMON block symbols in the module + // scope and add USE associations to their ultimate symbols to the current + // scope's USE-associated COMMON blocks. + for (const auto &[name, symbol] : useModuleScope_->commonBlockUses()) { + currScope().AddCommonBlockUse(name, symbol->attrs(), symbol->GetUltimate()); + } useModuleScope_ = nullptr; } diff --git a/flang/lib/Semantics/scope.cpp b/flang/lib/Semantics/scope.cpp index 4af371f3611f3..ab75d4c608387 100644 --- a/flang/lib/Semantics/scope.cpp +++ b/flang/lib/Semantics/scope.cpp @@ -144,9 +144,8 @@ void Scope::add_crayPointer(const SourceName &name, Symbol &pointer) { } Symbol &Scope::MakeCommonBlock(SourceName name, SourceName location) { - const auto it{commonBlocks_.find(name)}; - if (it != commonBlocks_.end()) { - return *it->second; + if (auto *cb{FindCommonBlock(name)}) { + return *cb; } else { Symbol &symbol{MakeSymbol( name, Attrs{}, CommonBlockDetails{name.empty() ? location : name})}; @@ -154,9 +153,25 @@ Symbol &Scope::MakeCommonBlock(SourceName name, SourceName location) { return symbol; } } -Symbol *Scope::FindCommonBlock(const SourceName &name) const { - const auto it{commonBlocks_.find(name)}; - return it != commonBlocks_.end() ? &*it->second : nullptr; + +Symbol *Scope::FindCommonBlockInVisibleScopes(const SourceName &name) const { + if (Symbol * cb{FindCommonBlock(name)}) { + return cb; + } else if (Symbol * cb{FindCommonBlockUse(name)}) { + return &cb->GetUltimate(); + } else if (IsSubmodule()) { + if (const Scope *parent{ + symbol_ ? 
symbol_->get().parent() : nullptr}) { + if (auto *cb{parent->FindCommonBlockInVisibleScopes(name)}) { + return cb; + } + } + } else if (!IsTopLevel() && parent_) { + if (auto *cb{parent_->FindCommonBlockInVisibleScopes(name)}) { + return cb; + } + } + return nullptr; } Scope *Scope::FindSubmodule(const SourceName &name) const { @@ -167,6 +182,31 @@ Scope *Scope::FindSubmodule(const SourceName &name) const { return &*it->second; } } + +bool Scope::AddCommonBlockUse( + const SourceName &name, Attrs attrs, Symbol &cbUltimate) { + CHECK(cbUltimate.has()); + // Make a symbol, but don't add it to the Scope, since it needs to + // be added to the USE-associated COMMON blocks + Symbol &useCB{MakeSymbol(name, attrs, UseDetails{name, cbUltimate})}; + return commonBlockUses_.emplace(name, useCB).second; +} + +Symbol *Scope::FindCommonBlock(const SourceName &name) const { + if (const auto it{commonBlocks_.find(name)}; it != commonBlocks_.end()) { + return &*it->second; + } + return nullptr; +} + +Symbol *Scope::FindCommonBlockUse(const SourceName &name) const { + if (const auto it{commonBlockUses_.find(name)}; + it != commonBlockUses_.end()) { + return &*it->second; + } + return nullptr; +} + bool Scope::AddSubmodule(const SourceName &name, Scope &submodule) { return submodules_.emplace(name, submodule).second; } diff --git a/flang/test/Semantics/OpenACC/acc-common.f90 b/flang/test/Semantics/OpenACC/acc-common.f90 new file mode 100644 index 0000000000000..31c4d190d576b --- /dev/null +++ b/flang/test/Semantics/OpenACC/acc-common.f90 @@ -0,0 +1,41 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenacc +module acc_common_decl + implicit none + integer a + common /a_common/ a +!$acc declare create (/a_common/) + data a/42/ +end module acc_common_decl + +module acc_common_another + implicit none + integer c, d + common /a_common/ c, d +!$acc declare create (/a_common/) +end module acc_common_another + +module acc_common_intermediate + use acc_common_decl + implicit none + integer b + common /b_common/ b +!$acc declare create (/b_common/) +end module acc_common_intermediate + +program acc_decl_test + use acc_common_intermediate + use acc_common_another + implicit none + + a = 1 + b = 10 +!$acc update device (/a_common/) + a = 2 +!$acc update device (/b_common/) + b = 20 +!$acc update device (/a_common/) + c = 3 + d = 30 +!ERROR: Could not find COMMON block 'a_common_bad' used in OpenACC directive +!$acc update device (/a_common_bad/) +end program From c89eb13e6e2b839ca4e6851849224a41d35f8637 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 21 Oct 2025 15:20:26 -0700 Subject: [PATCH 024/530] [CIR] Handle implicit value init constants (#164504) This adds a handle for VisitImplicitValueInitExpr in ConstExprEmitter. 
--- clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 4 +--- clang/test/CIR/CodeGen/struct-init.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 19ed6560245b4..65e6a3915f241 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -1028,9 +1028,7 @@ class ConstExprEmitter mlir::Attribute VisitImplicitValueInitExpr(ImplicitValueInitExpr *e, QualType t) { - cgm.errorNYI(e->getBeginLoc(), - "ConstExprEmitter::VisitImplicitValueInitExpr"); - return {}; + return cgm.getBuilder().getZeroInitAttr(cgm.convertType(t)); } mlir::Attribute VisitInitListExpr(InitListExpr *ile, QualType t) { diff --git a/clang/test/CIR/CodeGen/struct-init.cpp b/clang/test/CIR/CodeGen/struct-init.cpp index a47ef530b8369..2887e6f404ffc 100644 --- a/clang/test/CIR/CodeGen/struct-init.cpp +++ b/clang/test/CIR/CodeGen/struct-init.cpp @@ -9,6 +9,12 @@ struct S { int a, b, c; }; +S partial_init = { 1 }; + +// CIR: cir.global external @partial_init = #cir.const_record<{#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i}> : !rec_S +// LLVM: @partial_init = global %struct.S { i32 1, i32 0, i32 0 } +// OGCG: @partial_init = global %struct.S { i32 1, i32 0, i32 0 } + void init() { S s1 = {1, 2, 3}; S s2 = {4, 5}; From e29d4ebb89705162b588caf69b45581f1da3a45c Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 21 Oct 2025 15:21:27 -0700 Subject: [PATCH 025/530] [lldb-dap] Correct lldb-dap `seq` handling. (#164306) We've been treating the `seq` attribute incorrectly in lldb-dap. Previously, we always had `seq=0` for events and responses. We only filled in the `seq` field on reverse requests. However, looking at the spec and other DAP implementations, we are supposed to fill in the `seq` field for each request we send to the DAP client. I've updated our usage to fill in `seq` in `DAP::Send` so that all events/requests/responses have a properly filled `seq` value. --- .../test/tools/lldb-dap/dap_server.py | 6 +++ lldb/tools/lldb-dap/DAP.cpp | 45 +++++++++------- lldb/tools/lldb-dap/DAP.h | 24 ++++----- .../tools/lldb-dap/Handler/RequestHandler.cpp | 11 ++-- lldb/tools/lldb-dap/JSONUtils.cpp | 5 +- lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp | 35 ++++++------ lldb/tools/lldb-dap/Protocol/ProtocolBase.h | 54 +++++++++++++------ 7 files changed, 105 insertions(+), 75 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index a3d924d495fb1..d892c01f0bc71 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -11,6 +11,7 @@ import signal import sys import threading +import warnings import time from typing import ( Any, @@ -383,6 +384,10 @@ def _process_recv_packets(self) -> None: """Process received packets, updating the session state.""" with self._recv_condition: for packet in self._recv_packets: + if packet and ("seq" not in packet or packet["seq"] == 0): + warnings.warn( + f"received a malformed packet, expected 'seq != 0' for {packet!r}" + ) # Handle events that may modify any stateful properties of # the DAP session. 
if packet and packet["type"] == "event": @@ -576,6 +581,7 @@ def wait_for_stopped(self) -> Optional[List[Event]]: # If we exited, then we are done if stopped_event["event"] == "exited": break + # Otherwise we stopped and there might be one or more 'stopped' # events for each thread that stopped with a reason, so keep # checking for more 'stopped' events and return all of them diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 61226cceb6db0..52c8c6b9092b9 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -266,40 +266,49 @@ void DAP::SendJSON(const llvm::json::Value &json) { Send(message); } -void DAP::Send(const Message &message) { - if (const protocol::Event *event = std::get_if(&message)) { +Id DAP::Send(const Message &message) { + std::lock_guard guard(call_mutex); + Message msg = std::visit( + [this](auto &&msg) -> Message { + if (msg.seq == kCalculateSeq) + msg.seq = seq++; + return msg; + }, + Message(message)); + + if (const protocol::Event *event = std::get_if(&msg)) { if (llvm::Error err = transport.Send(*event)) DAP_LOG_ERROR(log, std::move(err), "({0}) sending event failed", m_client_name); - return; + return event->seq; } - if (const Request *req = std::get_if(&message)) { + if (const Request *req = std::get_if(&msg)) { if (llvm::Error err = transport.Send(*req)) DAP_LOG_ERROR(log, std::move(err), "({0}) sending request failed", m_client_name); - return; + return req->seq; } - if (const Response *resp = std::get_if(&message)) { + if (const Response *resp = std::get_if(&msg)) { // FIXME: After all the requests have migrated from LegacyRequestHandler > // RequestHandler<> this should be handled in RequestHandler<>::operator(). // If the debugger was interrupted, convert this response into a // 'cancelled' response because we might have a partial result. - llvm::Error err = - (debugger.InterruptRequested()) - ? transport.Send({/*request_seq=*/resp->request_seq, - /*command=*/resp->command, - /*success=*/false, - /*message=*/eResponseMessageCancelled, - /*body=*/std::nullopt}) - : transport.Send(*resp); - if (err) { + llvm::Error err = (debugger.InterruptRequested()) + ? transport.Send({ + /*request_seq=*/resp->request_seq, + /*command=*/resp->command, + /*success=*/false, + /*message=*/eResponseMessageCancelled, + /*body=*/std::nullopt, + /*seq=*/resp->seq, + }) + : transport.Send(*resp); + if (err) DAP_LOG_ERROR(log, std::move(err), "({0}) sending response failed", m_client_name); - return; - } - return; + return resp->seq; } llvm_unreachable("Unexpected message type"); diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index bf2c3f146a396..b4f111e4e720c 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -136,7 +136,7 @@ struct DAP final : public DAPTransport::MessageHandler { /// unless we send a "thread" event to indicate the thread exited. llvm::DenseSet thread_ids; - uint32_t reverse_request_seq = 0; + protocol::Id seq = 0; std::mutex call_mutex; llvm::SmallDenseMap> inflight_reverse_requests; @@ -220,8 +220,8 @@ struct DAP final : public DAPTransport::MessageHandler { /// Serialize the JSON value into a string and send the JSON packet to the /// "out" stream. void SendJSON(const llvm::json::Value &json); - /// Send the given message to the client - void Send(const protocol::Message &message); + /// Send the given message to the client. 
+ protocol::Id Send(const protocol::Message &message); void SendOutput(OutputType o, const llvm::StringRef output); @@ -353,19 +353,13 @@ struct DAP final : public DAPTransport::MessageHandler { template void SendReverseRequest(llvm::StringRef command, llvm::json::Value arguments) { - int64_t id; - { - std::lock_guard locker(call_mutex); - id = ++reverse_request_seq; - inflight_reverse_requests[id] = std::make_unique(command, id); - } - - SendJSON(llvm::json::Object{ - {"type", "request"}, - {"seq", id}, - {"command", command}, - {"arguments", std::move(arguments)}, + protocol::Id id = Send(protocol::Request{ + command.str(), + std::move(arguments), }); + + std::lock_guard locker(call_mutex); + inflight_reverse_requests[id] = std::make_unique(command, id); } /// The set of capabilities supported by this adapter. diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp index de63c9d78efd1..d67437ad5b3ae 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp @@ -157,11 +157,12 @@ RunInTerminal(DAP &dap, const protocol::LaunchRequestArguments &arguments) { void BaseRequestHandler::Run(const Request &request) { // If this request was cancelled, send a cancelled response. if (dap.IsCancelled(request)) { - Response cancelled{/*request_seq=*/request.seq, - /*command=*/request.command, - /*success=*/false, - /*message=*/eResponseMessageCancelled, - /*body=*/std::nullopt}; + Response cancelled{ + /*request_seq=*/request.seq, + /*command=*/request.command, + /*success=*/false, + /*message=*/eResponseMessageCancelled, + }; dap.Send(cancelled); return; } diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 71e91f8f41a68..2780a5b7748e8 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -10,6 +10,7 @@ #include "DAP.h" #include "ExceptionBreakpoint.h" #include "LLDBUtils.h" +#include "Protocol/ProtocolBase.h" #include "ProtocolUtils.h" #include "lldb/API/SBAddress.h" #include "lldb/API/SBCompileUnit.h" @@ -284,7 +285,7 @@ void FillResponse(const llvm::json::Object &request, // Fill in all of the needed response fields to a "request" and set "success" // to true by default. 
response.try_emplace("type", "response"); - response.try_emplace("seq", (int64_t)0); + response.try_emplace("seq", protocol::kCalculateSeq); EmplaceSafeString(response, "command", GetString(request, "command").value_or("")); const uint64_t seq = GetInteger(request, "seq").value_or(0); @@ -417,7 +418,7 @@ llvm::json::Value CreateScope(const llvm::StringRef name, // } llvm::json::Object CreateEventObject(const llvm::StringRef event_name) { llvm::json::Object event; - event.try_emplace("seq", 0); + event.try_emplace("seq", protocol::kCalculateSeq); event.try_emplace("type", "event"); EmplaceSafeString(event, "event", event_name); return event; diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp index 9cd9028d879e9..72359214c8537 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp @@ -58,6 +58,8 @@ bool fromJSON(const json::Value &Params, MessageType &M, json::Path P) { } json::Value toJSON(const Request &R) { + assert(R.seq != kCalculateSeq && "invalid seq"); + json::Object Result{ {"type", "request"}, {"seq", R.seq}, @@ -103,8 +105,10 @@ bool operator==(const Request &a, const Request &b) { } json::Value toJSON(const Response &R) { + assert(R.seq != kCalculateSeq && "invalid seq"); + json::Object Result{{"type", "response"}, - {"seq", 0}, + {"seq", R.seq}, {"command", R.command}, {"request_seq", R.request_seq}, {"success", R.success}}; @@ -132,8 +136,9 @@ json::Value toJSON(const Response &R) { return std::move(Result); } -bool fromJSON(json::Value const &Params, - std::variant &M, json::Path P) { +static bool fromJSON(json::Value const &Params, + std::variant &M, + json::Path P) { auto rawMessage = Params.getAsString(); if (!rawMessage) { P.report("expected a string"); @@ -157,8 +162,7 @@ bool fromJSON(json::Value const &Params, Response &R, json::Path P) { return false; MessageType type; - int64_t seq; - if (!O.map("type", type) || !O.map("seq", seq) || + if (!O.map("type", type) || !O.map("seq", R.seq) || !O.map("command", R.command) || !O.map("request_seq", R.request_seq)) return false; @@ -168,12 +172,7 @@ bool fromJSON(json::Value const &Params, Response &R, json::Path P) { } if (R.command.empty()) { - P.field("command").report("expected to not be ''"); - return false; - } - - if (R.request_seq == 0) { - P.field("request_seq").report("expected to not be '0'"); + P.field("command").report("expected to not be empty"); return false; } @@ -217,9 +216,11 @@ bool fromJSON(json::Value const &Params, ErrorMessage &EM, json::Path P) { } json::Value toJSON(const Event &E) { + assert(E.seq != kCalculateSeq && "invalid seq"); + json::Object Result{ {"type", "event"}, - {"seq", 0}, + {"seq", E.seq}, {"event", E.event}, }; @@ -235,8 +236,7 @@ bool fromJSON(json::Value const &Params, Event &E, json::Path P) { return false; MessageType type; - int64_t seq; - if (!O.map("type", type) || !O.map("seq", seq) || !O.map("event", E.event)) + if (!O.map("type", type) || !O.map("seq", E.seq) || !O.map("event", E.event)) return false; if (type != eMessageTypeEvent) { @@ -244,13 +244,8 @@ bool fromJSON(json::Value const &Params, Event &E, json::Path P) { return false; } - if (seq != 0) { - P.field("seq").report("expected to be '0'"); - return false; - } - if (E.event.empty()) { - P.field("event").report("expected to not be ''"); + P.field("event").report("expected to not be empty"); return false; } diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolBase.h b/lldb/tools/lldb-dap/Protocol/ProtocolBase.h 
index 92e41b1dbf595..42c6c8890af24 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolBase.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolBase.h @@ -30,19 +30,15 @@ namespace lldb_dap::protocol { // MARK: Base Protocol +/// Message unique identifier type. using Id = int64_t; +/// A unique identifier that indicates the `seq` field should be calculated by +/// the current session. +static constexpr Id kCalculateSeq = INT64_MAX; + /// A client or debug adapter initiated request. struct Request { - /// Sequence number of the message (also known as message ID). The `seq` for - /// the first message sent by a client or debug adapter is 1, and for each - /// subsequent message is 1 greater than the previous message sent by that - /// actor. `seq` can be used to order requests, responses, and events, and to - /// associate requests with their corresponding responses. For protocol - /// messages of type `request` the sequence number can be used to cancel the - /// request. - Id seq; - /// The command to execute. std::string command; @@ -50,7 +46,16 @@ struct Request { /// /// Request handlers are expected to validate the arguments, which is handled /// by `RequestHandler`. - std::optional arguments; + std::optional arguments = std::nullopt; + + /// Sequence number of the message (also known as message ID). The `seq` for + /// the first message sent by a client or debug adapter is 1, and for each + /// subsequent message is 1 greater than the previous message sent by that + /// actor. `seq` can be used to order requests, responses, and events, and to + /// associate requests with their corresponding responses. For protocol + /// messages of type `request` the sequence number can be used to cancel the + /// request. + Id seq = kCalculateSeq; }; llvm::json::Value toJSON(const Request &); bool fromJSON(const llvm::json::Value &, Request &, llvm::json::Path); @@ -62,7 +67,16 @@ struct Event { std::string event; /// Event-specific information. - std::optional body; + std::optional body = std::nullopt; + + /// Sequence number of the message (also known as message ID). The `seq` for + /// the first message sent by a client or debug adapter is 1, and for each + /// subsequent message is 1 greater than the previous message sent by that + /// actor. `seq` can be used to order requests, responses, and events, and to + /// associate requests with their corresponding responses. For protocol + /// messages of type `request` the sequence number can be used to cancel the + /// request. + Id seq = kCalculateSeq; }; llvm::json::Value toJSON(const Event &); bool fromJSON(const llvm::json::Value &, Event &, llvm::json::Path); @@ -78,7 +92,7 @@ enum ResponseMessage : unsigned { /// Response for a request. struct Response { /// Sequence number of the corresponding request. - Id request_seq; + Id request_seq = 0; /// The command requested. std::string command; @@ -87,21 +101,31 @@ struct Response { /// attribute may contain the result of the request. If the value is false, /// the attribute `message` contains the error in short form and the `body` /// may contain additional information (see `ErrorMessage`). - bool success; + bool success = false; // FIXME: Migrate usage of fallback string to ErrorMessage /// Contains the raw error in short form if `success` is false. This raw error /// might be interpreted by the client and is not shown in the UI. Some /// predefined values exist. 
- std::optional> message; + std::optional> message = + std::nullopt; /// Contains request result if success is true and error details if success is /// false. /// /// Request handlers are expected to build an appropriate body, see /// `RequestHandler`. - std::optional body; + std::optional body = std::nullopt; + + /// Sequence number of the message (also known as message ID). The `seq` for + /// the first message sent by a client or debug adapter is 1, and for each + /// subsequent message is 1 greater than the previous message sent by that + /// actor. `seq` can be used to order requests, responses, and events, and to + /// associate requests with their corresponding responses. For protocol + /// messages of type `request` the sequence number can be used to cancel the + /// request. + Id seq = kCalculateSeq; }; bool fromJSON(const llvm::json::Value &, Response &, llvm::json::Path); llvm::json::Value toJSON(const Response &); From daa15f428338c586ab685ebf510471d215a941b7 Mon Sep 17 00:00:00 2001 From: Erick Velez Date: Tue, 21 Oct 2025 15:38:18 -0700 Subject: [PATCH 026/530] [clang-doc] fix overlapping navbar (#164314) The navbar has a height of 60px, but the sidebar didn't have any top padding so it began underneath the navbar. Now the sidebar has a top padding of 60px. --- clang-tools-extra/clang-doc/assets/clang-doc-mustache.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css b/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css index e555ee7c370f7..67f11f77eae61 100644 --- a/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css +++ b/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css @@ -396,7 +396,7 @@ body, html { .sidebar { width: 250px; - top: 0; + top: 60px; left: 0; height: 100%; position: fixed; From 5f20b5a267a8739216bba1c4f19b9e50d472bf40 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 21 Oct 2025 18:12:45 -0500 Subject: [PATCH 027/530] [clang] Fix Typo on cindex.py (#92945) Co-authored-by: Thomas Applencourt --- clang/bindings/python/clang/cindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index da97bd375608c..2786add27f5e8 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -3015,7 +3015,7 @@ class _CXUnsavedFile(Structure): # Functions calls through the python interface are rather slow. Fortunately, -# for most symboles, we do not need to perform a function call. Their spelling +# for most symbols, we do not need to perform a function call. Their spelling # never changes and is consequently provided by this spelling cache. SPELLING_CACHE = { # 0: CompletionChunk.Kind("Optional"), From fde69fd0bcb139092df370f13ef8b61ebd658105 Mon Sep 17 00:00:00 2001 From: LeeYoungJoon Date: Wed, 22 Oct 2025 08:31:35 +0900 Subject: [PATCH 028/530] [QualGroup][docs] Document decision-taking process and refresh sync-up materials (#163122) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary This PR documents the LLVM Qualification Working Group’s decision-taking process and updates the latest sync-up meeting materials. ## Changes * Added a new **Decision Taking** section describing: * Principles (consensus-first, inclusiveness, transparency)Discussion time limits * Voting procedure * Documentation of outcomes * Updated Meeting Materials with the October 2025 presentation slides. 
## Background The LLVM Qualification Working Group aims to ensure decisions are made transparently and collaboratively. This new section documents how proposals are discussed, consensus is reached, and outcomes are recorded, providing clear guidance for contributors. It also keeps the group’s meeting resources up to date for easier reference and continuity. ## Testing - [x] Documentation builds successfully - [x] New “Decision Taking” section renders correctly - [x] Links and internal references are properly formatted - [x] Meeting Materials section updated with the October 2025 slides ## Related Links * [LLVM Qualification Working Group Documentation](https://llvm.org/docs/QualGroup.html) --- llvm/docs/QualGroup.rst | 43 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/llvm/docs/QualGroup.rst b/llvm/docs/QualGroup.rst index 5c05e4ed72075..0e73ec5c8be78 100644 --- a/llvm/docs/QualGroup.rst +++ b/llvm/docs/QualGroup.rst @@ -181,6 +181,46 @@ Membership Review To ensure the group remains active and focused, member participation will be reviewed every six months. Inactive members may be removed following this review. +Decision Taking +--------------- + +The LLVM Qualification Working Group aims to make decisions transparently, collaboratively, and without unnecessary formality. The goal is to maintain efficiency while encouraging broad participation and mutual understanding. + +This section describes the lightweight process used to handle proposals and decisions within the group. It may be revised as the group evolves and gains experience. + +Principles +^^^^^^^^^^ + +* **Consensus first:** The preferred mode of decision-making is consensus through open discussion (primarily on Discord or during sync-up meetings). +* **Inclusiveness and respect:** All viewpoints are encouraged, and members are expected to contribute constructively toward reaching a shared understanding. +* **Transparency:** Discussions leading to a decision should be visible to the group and, whenever appropriate, summarized in public channels (e.g., Discourse meeting notes, Discord channel, documentation updates). + +Consensus and Time Limits +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Discussions remain open until a clear consensus emerges, meaning no sustained objections have been raised after reasonable discussion. + +To prevent open-ended debates, if no new viewpoints are expressed after an agreed period (e.g., 2 weeks), the moderator (typically the person who started the discussion thread) may take one of the following actions: + +* **Summarize the apparent consensus** and close the discussion, or +* **Postpone the topic** to the next sync-up meeting if the outcome remains unclear, or +* **Call for a short vote** to confirm the group’s position. + +Voting Procedure +^^^^^^^^^^^^^^^^ + +When consensus cannot be reached or when a clear yes/no decision is needed: + +* The moderator may call for a **simple vote** using emoji reactions on Discord or a similar visible method. +* A decision passes if it receives a **majority (>50%)** of votes among **participants who voted.** Non-votes are **not counted** in the total. +* To ensure decisions reflect the collective position of the group, **at least three-quarters of the total core members** must participate in the vote for it to be considered valid. +* If results are evenly split **(50/50)**, or if participation falls below this threshold, the topic may be postponed to the next sync-up meeting for further discussion. 
+ +Documentation +^^^^^^^^^^^^^ + +Final decisions should be briefly documented (e.g., in meeting minutes, the corresponding GitHub issue, or Discord discussion thread). Once stable, the resulting policy or outcome may be reflected in this documentation for reference. + Current Topics & Backlog ======================== @@ -205,10 +245,11 @@ Slides used to support discussions during sync-up meetings are stored in LLVM's Available slides: +* (add future entries here) +* `October 2025 `_ * `September 2025 `_ * `August 2025 `_ * `July 2025 `_ -* (add future entries here) AI Transcription Policy ======================= From 93dd17a0e0a437f9c60450db52b019c9d40e3d05 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 21 Oct 2025 16:32:10 -0700 Subject: [PATCH 029/530] [lit] Ensure ulimit does not persist across tests (#164485) When constructing a container in a default argument in Python, we actually end up with a reference to the same default container for every invocation of the function. Given this happened with the ulimit variable in ShellEnvironment, we ended up persisting limits across tests. This would cause some LLVM tests to fail, particularly jitlink and dsymutil tests, if they ended up running after the one clang test that uses ulimit -v to set a maximum amount of memory that can be allocated. This patch fixes that behavior by constructing the dict inside the function to ensure we get a new instance and adds test coverage. --- llvm/utils/lit/lit/TestRunner.py | 4 ++-- llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_reset.txt | 3 +++ llvm/utils/lit/tests/shtest-ulimit.py | 7 +++++-- 3 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_reset.txt diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index a7e2705f609af..f88314547bb3f 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -92,12 +92,12 @@ class ShellEnvironment(object): we maintain a dir stack for pushd/popd. """ - def __init__(self, cwd, env, umask=-1, ulimit={}): + def __init__(self, cwd, env, umask=-1, ulimit=None): self.cwd = cwd self.env = dict(env) self.umask = umask self.dirStack = [] - self.ulimit = ulimit + self.ulimit = ulimit if ulimit else {} def change_dir(self, newdir): if os.path.isabs(newdir): diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_reset.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_reset.txt new file mode 100644 index 0000000000000..011d6db6c127f --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_reset.txt @@ -0,0 +1,3 @@ +# RUN: %{python} %S/print_limits.py +# Fail the test so that we can assert on the output. +# RUN: not echo return diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py index e84327772d3a1..9a8febd3333c4 100644 --- a/llvm/utils/lit/tests/shtest-ulimit.py +++ b/llvm/utils/lit/tests/shtest-ulimit.py @@ -5,9 +5,9 @@ # as well. 
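# --- Editor's note: an illustrative sketch of the mutable-default-argument
# pitfall described in the commit message above, kept separate from the patch
# itself. `Limits`/`FixedLimits` are hypothetical stand-ins, not lit's actual
# ShellEnvironment class; the `ulimit if ulimit else {}` line mirrors the fix
# applied in TestRunner.py.
class Limits:
    def __init__(self, ulimit={}):  # bug: one dict object is shared by every call
        self.ulimit = ulimit

class FixedLimits:
    def __init__(self, ulimit=None):  # fix: build a fresh dict inside the function
        self.ulimit = ulimit if ulimit else {}

first = Limits()
first.ulimit["RLIMIT_NOFILE"] = 50
assert Limits().ulimit == {"RLIMIT_NOFILE": 50}  # the limit leaked into a brand-new instance

first_fixed = FixedLimits()
first_fixed.ulimit["RLIMIT_NOFILE"] = 50
assert FixedLimits().ulimit == {}  # no leakage: each instance gets its own dict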
# UNSUPPORTED: system-windows, system-solaris -# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit | FileCheck %s +# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit --order=lexical | FileCheck %s -# CHECK: -- Testing: 2 tests{{.*}} +# CHECK: -- Testing: 3 tests{{.*}} # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit-bad-arg.txt ({{[^)]*}}) # CHECK: ulimit -n @@ -16,3 +16,6 @@ # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}}) # CHECK: ulimit -n 50 # CHECK: RLIMIT_NOFILE=50 + +# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_reset.txt ({{[^)]*}}) +# CHECK-NOT: RLIMIT_NOFILE=50 From 2bc4ad35e7dc9f963eba5e4b80ea6e5e6c673ef8 Mon Sep 17 00:00:00 2001 From: lntue Date: Tue, 21 Oct 2025 20:09:41 -0400 Subject: [PATCH 030/530] [libc][math] Add tolerance to math tests so that they still work when accurate path is skipped. (#164522) --- libc/src/__support/math/atan2f.h | 4 +- libc/test/UnitTest/FPMatcher.h | 53 ++++++++++++++------------ libc/test/src/math/acos_test.cpp | 9 ++++- libc/test/src/math/acosf16_test.cpp | 11 +++++- libc/test/src/math/acosf_test.cpp | 11 +++++- libc/test/src/math/acoshf_test.cpp | 9 ++++- libc/test/src/math/asin_test.cpp | 11 +++++- libc/test/src/math/asinf_test.cpp | 11 +++++- libc/test/src/math/asinhf16_test.cpp | 13 ++++++- libc/test/src/math/asinhf_test.cpp | 11 +++++- libc/test/src/math/atan2f_test.cpp | 19 +++++++-- libc/test/src/math/atanf16_test.cpp | 11 +++++- libc/test/src/math/cbrt_test.cpp | 13 +++++-- libc/test/src/math/cos_test.cpp | 9 ++++- libc/test/src/math/cosf16_test.cpp | 11 +++++- libc/test/src/math/cosf_test.cpp | 24 ++++++++---- libc/test/src/math/coshf16_test.cpp | 11 +++++- libc/test/src/math/erff_test.cpp | 18 ++++++--- libc/test/src/math/exp10_test.cpp | 18 ++++++--- libc/test/src/math/exp10f16_test.cpp | 13 ++++++- libc/test/src/math/exp10m1f16_test.cpp | 13 ++++++- libc/test/src/math/exp10m1f_test.cpp | 11 +++++- libc/test/src/math/exp2_test.cpp | 6 +-- libc/test/src/math/exp2f16_test.cpp | 11 +++++- libc/test/src/math/exp2f_test.cpp | 10 ++++- libc/test/src/math/exp2m1f16_test.cpp | 13 ++++++- libc/test/src/math/exp2m1f_test.cpp | 10 ++++- libc/test/src/math/exp_test.cpp | 18 ++++++--- libc/test/src/math/expf16_test.cpp | 11 +++++- libc/test/src/math/expf_test.cpp | 10 ++++- libc/test/src/math/expm1_test.cpp | 12 ++++-- libc/test/src/math/expm1f16_test.cpp | 13 ++++++- libc/test/src/math/expm1f_test.cpp | 12 ++++-- libc/test/src/math/log10_test.cpp | 16 +++++--- libc/test/src/math/log10f16_test.cpp | 13 ++++++- libc/test/src/math/log10f_test.cpp | 10 ++++- libc/test/src/math/log1p_test.cpp | 19 +++++---- libc/test/src/math/log1pf_test.cpp | 10 ++++- libc/test/src/math/log2_test.cpp | 6 +-- libc/test/src/math/log2f16_test.cpp | 10 ++++- libc/test/src/math/log_test.cpp | 16 +++++--- libc/test/src/math/logf16_test.cpp | 11 +++++- libc/test/src/math/logf_test.cpp | 14 ++++++- libc/test/src/math/powf_test.cpp | 13 +++++-- libc/test/src/math/sin_test.cpp | 9 ++++- libc/test/src/math/sincos_test.cpp | 23 +++++++---- libc/test/src/math/sincosf_test.cpp | 24 +++++++----- libc/test/src/math/sinf16_test.cpp | 11 +++++- libc/test/src/math/sinf_test.cpp | 36 ++++++++++------- libc/test/src/math/sinhf16_test.cpp | 11 +++++- libc/test/src/math/sinhf_test.cpp | 12 ++++-- libc/test/src/math/smoke/sinf_test.cpp | 6 +++ libc/test/src/math/tan_test.cpp | 13 +++++-- libc/test/src/math/tanf16_test.cpp | 11 +++++- libc/test/src/math/tanf_test.cpp | 11 +++++- libc/test/src/math/tanhf16_test.cpp | 11 +++++- libc/test/src/math/tanpif16_test.cpp | 13 ++++++- 57 files 
changed, 579 insertions(+), 190 deletions(-) diff --git a/libc/src/__support/math/atan2f.h b/libc/src/__support/math/atan2f.h index e3b19329126f4..0133d12c1e071 100644 --- a/libc/src/__support/math/atan2f.h +++ b/libc/src/__support/math/atan2f.h @@ -18,9 +18,11 @@ #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA #if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \ - defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT) + defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT) && \ + defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) // We use float-float implementation to reduce size. #include "atan2f_float.h" diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index f74276f54eb25..430727e537107 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -16,6 +16,7 @@ #include "src/__support/FPUtil/fpbits_str.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/RoundingModeUtils.h" @@ -294,42 +295,46 @@ struct ModifyMXCSR { #define EXPECT_MATH_ERRNO(expected) \ do { \ - if (math_errhandling & MATH_ERRNO) { \ - int actual = libc_errno; \ - libc_errno = 0; \ - EXPECT_EQ(actual, expected); \ - } \ + if ((LIBC_MATH & LIBC_MATH_NO_ERRNO) == 0) \ + if (math_errhandling & MATH_ERRNO) { \ + int actual = libc_errno; \ + libc_errno = 0; \ + EXPECT_EQ(actual, expected); \ + } \ } while (0) #define ASSERT_MATH_ERRNO(expected) \ do { \ - if (math_errhandling & MATH_ERRNO) { \ - int actual = libc_errno; \ - libc_errno = 0; \ - ASSERT_EQ(actual, expected); \ - } \ + if ((LIBC_MATH & LIBC_MATH_NO_ERRNO) == 0) \ + if (math_errhandling & MATH_ERRNO) { \ + int actual = libc_errno; \ + libc_errno = 0; \ + ASSERT_EQ(actual, expected); \ + } \ } while (0) #define EXPECT_FP_EXCEPTION(expected) \ do { \ - if (math_errhandling & MATH_ERREXCEPT) { \ - EXPECT_EQ( \ - LIBC_NAMESPACE::fputil::test_except( \ - static_cast(FE_ALL_EXCEPT)) & \ - ((expected) ? (expected) : static_cast(FE_ALL_EXCEPT)), \ - (expected)); \ - } \ + if ((LIBC_MATH & LIBC_MATH_NO_EXCEPT) == 0) \ + if (math_errhandling & MATH_ERREXCEPT) { \ + EXPECT_EQ( \ + LIBC_NAMESPACE::fputil::test_except( \ + static_cast(FE_ALL_EXCEPT)) & \ + ((expected) ? (expected) : static_cast(FE_ALL_EXCEPT)), \ + (expected)); \ + } \ } while (0) #define ASSERT_FP_EXCEPTION(expected) \ do { \ - if (math_errhandling & MATH_ERREXCEPT) { \ - ASSERT_EQ( \ - LIBC_NAMESPACE::fputil::test_except( \ - static_cast(FE_ALL_EXCEPT)) & \ - ((expected) ? (expected) : static_cast(FE_ALL_EXCEPT)), \ - (expected)); \ - } \ + if ((LIBC_MATH & LIBC_MATH_NO_EXCEPT) == 0) \ + if (math_errhandling & MATH_ERREXCEPT) { \ + ASSERT_EQ( \ + LIBC_NAMESPACE::fputil::test_except( \ + static_cast(FE_ALL_EXCEPT)) & \ + ((expected) ? 
(expected) : static_cast(FE_ALL_EXCEPT)), \ + (expected)); \ + } \ } while (0) #define EXPECT_FP_EQ_WITH_EXCEPTION(expected_val, actual_val, expected_except) \ diff --git a/libc/test/src/math/acos_test.cpp b/libc/test/src/math/acos_test.cpp index 140488702f0bc..8678fe620d6d3 100644 --- a/libc/test/src/math/acos_test.cpp +++ b/libc/test/src/math/acos_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/acos.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 8 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAcosTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -46,7 +53,7 @@ TEST_F(LlvmLibcAcosTest, InDoubleRange) { ++count; if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Acos, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Acos, x, result, tol, rounding_mode)) { diff --git a/libc/test/src/math/acosf16_test.cpp b/libc/test/src/math/acosf16_test.cpp index f4890c81b0bcb..ca33550c6ce92 100644 --- a/libc/test/src/math/acosf16_test.cpp +++ b/libc/test/src/math/acosf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/acosf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAcosf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -28,7 +35,7 @@ TEST_F(LlvmLibcAcosf16Test, PositiveRange) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, x, - LIBC_NAMESPACE::acosf16(x), 0.5); + LIBC_NAMESPACE::acosf16(x), TOLERANCE + 0.5); } } @@ -37,6 +44,6 @@ TEST_F(LlvmLibcAcosf16Test, NegativeRange) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, x, - LIBC_NAMESPACE::acosf16(x), 0.5); + LIBC_NAMESPACE::acosf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp index 3b45749467b80..40b87021109f9 100644 --- a/libc/test/src/math/acosf_test.cpp +++ b/libc/test/src/math/acosf_test.cpp @@ -10,11 +10,18 @@ #include "hdr/math_macros.h" #include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/acosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + namespace mpfr = LIBC_NAMESPACE::testing::mpfr; using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest; @@ -72,8 +79,8 @@ TEST_F(LlvmLibcAcosfTest, SpecificBitPatterns) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, x, - LIBC_NAMESPACE::acosf(x), 0.5); + LIBC_NAMESPACE::acosf(x), TOLERANCE + 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, -x, - LIBC_NAMESPACE::acosf(-x), 0.5); + 
LIBC_NAMESPACE::acosf(-x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp index 506f17680887e..a0e9b390b58de 100644 --- a/libc/test/src/math/acoshf_test.cpp +++ b/libc/test/src/math/acoshf_test.cpp @@ -10,11 +10,18 @@ #include "hdr/math_macros.h" #include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/acoshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -68,6 +75,6 @@ TEST_F(LlvmLibcAcoshfTest, SpecificBitPatterns) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acosh, x, - LIBC_NAMESPACE::acoshf(x), 0.5); + LIBC_NAMESPACE::acoshf(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/asin_test.cpp b/libc/test/src/math/asin_test.cpp index 03ae963e9f924..4e3638410b484 100644 --- a/libc/test/src/math/asin_test.cpp +++ b/libc/test/src/math/asin_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/asin.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 6 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAsinTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -47,7 +54,7 @@ TEST_F(LlvmLibcAsinTest, InDoubleRange) { ++count; if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Asin, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Asin, x, result, tol, rounding_mode)) { @@ -72,6 +79,7 @@ TEST_F(LlvmLibcAsinTest, InDoubleRange) { tlog << " Test Rounding To Nearest...\n"; test(mpfr::RoundingMode::Nearest); +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS tlog << " Test Rounding Downward...\n"; test(mpfr::RoundingMode::Downward); @@ -80,4 +88,5 @@ TEST_F(LlvmLibcAsinTest, InDoubleRange) { tlog << " Test Rounding Toward Zero...\n"; test(mpfr::RoundingMode::TowardZero); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS } diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp index 824bc1ef868af..20702c5c15e88 100644 --- a/libc/test/src/math/asinf_test.cpp +++ b/libc/test/src/math/asinf_test.cpp @@ -11,11 +11,18 @@ #include "hdr/math_macros.h" #include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/asinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -68,8 +75,8 @@ TEST_F(LlvmLibcAsinfTest, SpecificBitPatterns) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, x, - LIBC_NAMESPACE::asinf(x), 0.5); 
+ LIBC_NAMESPACE::asinf(x), TOLERANCE + 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, -x, - LIBC_NAMESPACE::asinf(-x), 0.5); + LIBC_NAMESPACE::asinf(-x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/asinhf16_test.cpp b/libc/test/src/math/asinhf16_test.cpp index 929d13713d197..8d0f754a0645b 100644 --- a/libc/test/src/math/asinhf16_test.cpp +++ b/libc/test/src/math/asinhf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/asinhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAsinhf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -28,7 +35,8 @@ TEST_F(LlvmLibcAsinhf16Test, PositiveRange) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, x, - LIBC_NAMESPACE::asinhf16(x), 0.5); + LIBC_NAMESPACE::asinhf16(x), + TOLERANCE + 0.5); } } @@ -37,6 +45,7 @@ TEST_F(LlvmLibcAsinhf16Test, NegativeRange) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, x, - LIBC_NAMESPACE::asinhf16(x), 0.5); + LIBC_NAMESPACE::asinhf16(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp index 4681aad03187d..9d9c042208d28 100644 --- a/libc/test/src/math/asinhf_test.cpp +++ b/libc/test/src/math/asinhf_test.cpp @@ -9,11 +9,18 @@ #include "hdr/math_macros.h" #include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/asinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -68,8 +75,8 @@ TEST_F(LlvmLibcAsinhfTest, SpecificBitPatterns) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, x, - LIBC_NAMESPACE::asinhf(x), 0.5); + LIBC_NAMESPACE::asinhf(x), TOLERANCE + 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, -x, - LIBC_NAMESPACE::asinhf(-x), 0.5); + LIBC_NAMESPACE::asinhf(-x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/atan2f_test.cpp b/libc/test/src/math/atan2f_test.cpp index 50ab38208089a..56b6967069523 100644 --- a/libc/test/src/math/atan2f_test.cpp +++ b/libc/test/src/math/atan2f_test.cpp @@ -8,11 +8,18 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/atan2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAtan2fTest = LIBC_NAMESPACE::testing::FPTest; using LIBC_NAMESPACE::testing::tlog; @@ -36,16 +43,20 @@ TEST_F(LlvmLibcAtan2fTest, TrickyInputs) { float x = INPUTS[i].x; float y = INPUTS[i].y; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, INPUTS[i], - LIBC_NAMESPACE::atan2f(x, y), 0.5); + 
LIBC_NAMESPACE::atan2f(x, y), + TOLERANCE + 0.5); INPUTS[i].x = -INPUTS[i].x; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, INPUTS[i], - LIBC_NAMESPACE::atan2f(-x, y), 0.5); + LIBC_NAMESPACE::atan2f(-x, y), + TOLERANCE + 0.5); INPUTS[i].y = -INPUTS[i].y; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, INPUTS[i], - LIBC_NAMESPACE::atan2f(-x, -y), 0.5); + LIBC_NAMESPACE::atan2f(-x, -y), + TOLERANCE + 0.5); INPUTS[i].x = -INPUTS[i].x; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan2, INPUTS[i], - LIBC_NAMESPACE::atan2f(x, -y), 0.5); + LIBC_NAMESPACE::atan2f(x, -y), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/atanf16_test.cpp b/libc/test/src/math/atanf16_test.cpp index fa383e718834e..e91133fc6e4ca 100644 --- a/libc/test/src/math/atanf16_test.cpp +++ b/libc/test/src/math/atanf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/atanf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcAtanf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcAtanf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, x, - LIBC_NAMESPACE::atanf16(x), 0.5); + LIBC_NAMESPACE::atanf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcAtanf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, x, - LIBC_NAMESPACE::atanf16(x), 0.5); + LIBC_NAMESPACE::atanf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/cbrt_test.cpp b/libc/test/src/math/cbrt_test.cpp index 2e2de16fc859d..61b8583cbc7ff 100644 --- a/libc/test/src/math/cbrt_test.cpp +++ b/libc/test/src/math/cbrt_test.cpp @@ -8,11 +8,18 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/cbrt.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcCbrtTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -49,7 +56,7 @@ TEST_F(LlvmLibcCbrtTest, InDoubleRange) { ++tested; if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cbrt, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cbrt, x, result, ulp, rounding_mode)) { @@ -98,8 +105,8 @@ TEST_F(LlvmLibcCbrtTest, SpecialValues) { for (double v : INPUTS) { double x = FPBits(v).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cbrt, x, - LIBC_NAMESPACE::cbrt(x), 0.5); + LIBC_NAMESPACE::cbrt(x), TOLERANCE + 0.5); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cbrt, -x, - LIBC_NAMESPACE::cbrt(-x), 0.5); + LIBC_NAMESPACE::cbrt(-x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp index e2d47917e545e..40c43f9894639 100644 --- a/libc/test/src/math/cos_test.cpp +++ 
b/libc/test/src/math/cos_test.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/cos.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcCosTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -57,7 +64,7 @@ TEST_F(LlvmLibcCosTest, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = INPUTS[i]; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, - LIBC_NAMESPACE::cos(x), 0.5); + LIBC_NAMESPACE::cos(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/cosf16_test.cpp b/libc/test/src/math/cosf16_test.cpp index b744e7817e4ba..fe6f4abd39009 100644 --- a/libc/test/src/math/cosf16_test.cpp +++ b/libc/test/src/math/cosf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/cosf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcCosf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcCosf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, - LIBC_NAMESPACE::cosf16(x), 0.5); + LIBC_NAMESPACE::cosf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcCosf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, - LIBC_NAMESPACE::cosf16(x), 0.5); + LIBC_NAMESPACE::cosf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp index 6afaa42e1f57d..d5d5dc3b4cd50 100644 --- a/libc/test/src/math/cosf_test.cpp +++ b/libc/test/src/math/cosf_test.cpp @@ -10,12 +10,21 @@ #include "hdr/math_macros.h" #include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/cosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "test/src/math/sdcomp26094.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 3 +#define FP_ASSERT ASSERT_MPFR_MATCH +#else +#define TOLERANCE 0 +#define FP_ASSERT ASSERT_MPFR_MATCH_ALL_ROUNDING +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest; @@ -45,8 +54,8 @@ TEST_F(LlvmLibcCosfTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, - LIBC_NAMESPACE::cosf(x), 0.5); + FP_ASSERT(mpfr::Operation::Cos, x, LIBC_NAMESPACE::cosf(x), + TOLERANCE + 0.5); } } @@ -99,10 +108,10 @@ TEST_F(LlvmLibcCosfTest, SpecificBitPatterns) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); - 
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, - LIBC_NAMESPACE::cosf(x), 0.5); - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, -x, - LIBC_NAMESPACE::cosf(-x), 0.5); + FP_ASSERT(mpfr::Operation::Cos, x, LIBC_NAMESPACE::cosf(x), + TOLERANCE + 0.5); + FP_ASSERT(mpfr::Operation::Cos, -x, LIBC_NAMESPACE::cosf(-x), + TOLERANCE + 0.5); } } @@ -111,6 +120,7 @@ TEST_F(LlvmLibcCosfTest, SpecificBitPatterns) { TEST_F(LlvmLibcCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { float x = FPBits(v).get_val(); - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, LIBC_NAMESPACE::cosf(x), 0.5); + FP_ASSERT(mpfr::Operation::Cos, x, LIBC_NAMESPACE::cosf(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/coshf16_test.cpp b/libc/test/src/math/coshf16_test.cpp index a0d1fd2110478..8e8f55e128edf 100644 --- a/libc/test/src/math/coshf16_test.cpp +++ b/libc/test/src/math/coshf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/coshf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcCoshf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cosh, x, - LIBC_NAMESPACE::coshf16(x), 0.5); + LIBC_NAMESPACE::coshf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcCoshf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cosh, x, - LIBC_NAMESPACE::coshf16(x), 0.5); + LIBC_NAMESPACE::coshf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/erff_test.cpp b/libc/test/src/math/erff_test.cpp index c40aacf852bc9..17ff6c43f7717 100644 --- a/libc/test/src/math/erff_test.cpp +++ b/libc/test/src/math/erff_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/erff.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcErffTest = LIBC_NAMESPACE::testing::FPTest; @@ -37,9 +43,9 @@ TEST_F(LlvmLibcErffTest, TrickyInputs) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Erf, x, - LIBC_NAMESPACE::erff(x), 0.5); + LIBC_NAMESPACE::erff(x), TOLERANCE + 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Erf, -x, - LIBC_NAMESPACE::erff(-x), 0.5); + LIBC_NAMESPACE::erff(-x), TOLERANCE + 0.5); } } @@ -83,10 +89,10 @@ TEST_F(LlvmLibcErffTest, InFloatRange) { } } } - tlog << " Log failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Log failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog 
<< " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Erf, mx, mr, 0.5, rounding_mode); } }; diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp index 2ddcef0d77672..4257dd0d1458b 100644 --- a/libc/test/src/math/exp10_test.cpp +++ b/libc/test/src/math/exp10_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/exp10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExp10Test = LIBC_NAMESPACE::testing::FPTest; @@ -79,7 +85,7 @@ TEST_F(LlvmLibcExp10Test, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, - LIBC_NAMESPACE::exp10(x), 0.5); + LIBC_NAMESPACE::exp10(x), TOLERANCE + 0.5); } } @@ -112,7 +118,7 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) { ++count; if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Exp10, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Exp10, x, result, tol, rounding_mode)) { @@ -126,10 +132,10 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) { } } } - tlog << " Exp10 failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Exp10 failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Exp10, mx, mr, 0.5, rounding_mode); } }; diff --git a/libc/test/src/math/exp10f16_test.cpp b/libc/test/src/math/exp10f16_test.cpp index fc49331b5917c..7bd085012e583 100644 --- a/libc/test/src/math/exp10f16_test.cpp +++ b/libc/test/src/math/exp10f16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/exp10f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcExp10f16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,8 @@ TEST_F(LlvmLibcExp10f16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, - LIBC_NAMESPACE::exp10f16(x), 0.5); + LIBC_NAMESPACE::exp10f16(x), + TOLERANCE + 0.5); } } @@ -35,6 +43,7 @@ TEST_F(LlvmLibcExp10f16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, - LIBC_NAMESPACE::exp10f16(x), 0.5); + LIBC_NAMESPACE::exp10f16(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/exp10m1f16_test.cpp b/libc/test/src/math/exp10m1f16_test.cpp index 41bb12f7d0973..4ba16e0e70b69 100644 --- a/libc/test/src/math/exp10m1f16_test.cpp +++ 
b/libc/test/src/math/exp10m1f16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/exp10m1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,8 @@ TEST_F(LlvmLibcExp10m1f16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, - LIBC_NAMESPACE::exp10m1f16(x), 0.5); + LIBC_NAMESPACE::exp10m1f16(x), + TOLERANCE + 0.5); } } @@ -35,6 +43,7 @@ TEST_F(LlvmLibcExp10m1f16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, - LIBC_NAMESPACE::exp10m1f16(x), 0.5); + LIBC_NAMESPACE::exp10m1f16(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/exp10m1f_test.cpp b/libc/test/src/math/exp10m1f_test.cpp index 009c85d6cc453..b41a412b3bddc 100644 --- a/libc/test/src/math/exp10m1f_test.cpp +++ b/libc/test/src/math/exp10m1f_test.cpp @@ -7,14 +7,20 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/CPP/array.h" #include "src/__support/libc_errno.h" +#include "src/__support/macros/optimization.h" #include "src/math/exp10m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExp10m1fTest = LIBC_NAMESPACE::testing::FPTest; @@ -70,7 +76,8 @@ TEST_F(LlvmLibcExp10m1fTest, TrickyInputs) { for (float x : INPUTS) { EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, - LIBC_NAMESPACE::exp10m1f(x), 0.5); + LIBC_NAMESPACE::exp10m1f(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp index a3dcb58c5c288..324cafd926756 100644 --- a/libc/test/src/math/exp2_test.cpp +++ b/libc/test/src/math/exp2_test.cpp @@ -101,10 +101,10 @@ TEST_F(LlvmLibcExp2Test, InDoubleRange) { } } } - tlog << " Exp2 failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Exp2 failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, mx, mr, 0.5, rounding_mode); } }; diff --git a/libc/test/src/math/exp2f16_test.cpp b/libc/test/src/math/exp2f16_test.cpp index 503d8c2d89d94..d28632a77c4cb 100644 --- a/libc/test/src/math/exp2f16_test.cpp +++ b/libc/test/src/math/exp2f16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/exp2f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcExp2f16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcExp2f16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, - LIBC_NAMESPACE::exp2f16(x), 0.5); + LIBC_NAMESPACE::exp2f16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcExp2f16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, - LIBC_NAMESPACE::exp2f16(x), 0.5); + LIBC_NAMESPACE::exp2f16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp index 94141dda71843..5e5fd699ab4ac 100644 --- a/libc/test/src/math/exp2f_test.cpp +++ b/libc/test/src/math/exp2f_test.cpp @@ -7,14 +7,20 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/libc_errno.h" +#include "src/__support/macros/optimization.h" #include "src/math/exp2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest; @@ -71,7 +77,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) { libc_errno = 0; float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, - LIBC_NAMESPACE::exp2f(x), 0.5); + LIBC_NAMESPACE::exp2f(x), TOLERANCE + 0.5); EXPECT_MATH_ERRNO(0); } } diff --git a/libc/test/src/math/exp2m1f16_test.cpp b/libc/test/src/math/exp2m1f16_test.cpp index bf299118429d7..526484d8d196b 100644 --- a/libc/test/src/math/exp2m1f16_test.cpp +++ b/libc/test/src/math/exp2m1f16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/exp2m1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcExp2m1f16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,8 @@ TEST_F(LlvmLibcExp2m1f16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, - LIBC_NAMESPACE::exp2m1f16(x), 0.5); + LIBC_NAMESPACE::exp2m1f16(x), + TOLERANCE + 0.5); } } @@ -35,6 +43,7 @@ TEST_F(LlvmLibcExp2m1f16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, - LIBC_NAMESPACE::exp2m1f16(x), 0.5); + LIBC_NAMESPACE::exp2m1f16(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp index 7e9f6b50ce816..16b20d72e2958 100644 --- a/libc/test/src/math/exp2m1f_test.cpp +++ b/libc/test/src/math/exp2m1f_test.cpp @@ -7,15 +7,21 @@ //===----------------------------------------------------------------------===// #include 
"hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/libc_errno.h" +#include "src/__support/macros/optimization.h" #include "src/math/exp2m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExp2m1fTest = LIBC_NAMESPACE::testing::FPTest; @@ -39,7 +45,7 @@ TEST_F(LlvmLibcExp2m1fTest, TrickyInputs) { for (float x : INPUTS) { EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, - LIBC_NAMESPACE::exp2m1f(x), 0.5); + LIBC_NAMESPACE::exp2m1f(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp index 854bb5155e11d..5a785fbaee4fd 100644 --- a/libc/test/src/math/exp_test.cpp +++ b/libc/test/src/math/exp_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/exp.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExpTest = LIBC_NAMESPACE::testing::FPTest; @@ -52,7 +58,7 @@ TEST_F(LlvmLibcExpTest, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, - LIBC_NAMESPACE::exp(x), 0.5); + LIBC_NAMESPACE::exp(x), TOLERANCE + 0.5); } } @@ -85,7 +91,7 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) { ++count; // ASSERT_MPFR_MATCH(mpfr::Operation::Log, x, result, 0.5); if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Exp, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Exp, x, result, tol, rounding_mode)) { @@ -99,10 +105,10 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) { } } } - tlog << " Exp failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Exp failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Exp, mx, mr, 0.5, rounding_mode); } }; diff --git a/libc/test/src/math/expf16_test.cpp b/libc/test/src/math/expf16_test.cpp index ee89a9c716e12..4ea10ea5c8ad4 100644 --- a/libc/test/src/math/expf16_test.cpp +++ b/libc/test/src/math/expf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/expf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcExpf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { 
float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, - LIBC_NAMESPACE::expf16(x), 0.5); + LIBC_NAMESPACE::expf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcExpf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, - LIBC_NAMESPACE::expf16(x), 0.5); + LIBC_NAMESPACE::expf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp index 1695a60fbd491..05167b80ec06a 100644 --- a/libc/test/src/math/expf_test.cpp +++ b/libc/test/src/math/expf_test.cpp @@ -7,14 +7,20 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/libc_errno.h" +#include "src/__support/macros/optimization.h" #include "src/math/expf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest; @@ -94,7 +100,7 @@ TEST_F(LlvmLibcExpfTest, Borderline) { x = FPBits(0xc236bd8cU).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, - LIBC_NAMESPACE::expf(x), 0.5); + LIBC_NAMESPACE::expf(x), TOLERANCE + 0.5); EXPECT_MATH_ERRNO(0); } diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp index c185be9e35f60..8ca9b7710427b 100644 --- a/libc/test/src/math/expm1_test.cpp +++ b/libc/test/src/math/expm1_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/expm1.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExpm1Test = LIBC_NAMESPACE::testing::FPTest; @@ -36,9 +42,9 @@ TEST_F(LlvmLibcExpm1Test, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = INPUTS[i]; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, - LIBC_NAMESPACE::expm1(x), 0.5); + LIBC_NAMESPACE::expm1(x), TOLERANCE + 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, -x, - LIBC_NAMESPACE::expm1(-x), 0.5); + LIBC_NAMESPACE::expm1(-x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/expm1f16_test.cpp b/libc/test/src/math/expm1f16_test.cpp index a6a6fcf73d383..952a8796ea7c3 100644 --- a/libc/test/src/math/expm1f16_test.cpp +++ b/libc/test/src/math/expm1f16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/expm1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcExpm1f16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,8 @@ 
TEST_F(LlvmLibcExpm1f16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, - LIBC_NAMESPACE::expm1f16(x), 0.5); + LIBC_NAMESPACE::expm1f16(x), + TOLERANCE + 0.5); } } @@ -35,6 +43,7 @@ TEST_F(LlvmLibcExpm1f16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, - LIBC_NAMESPACE::expm1f16(x), 0.5); + LIBC_NAMESPACE::expm1f16(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp index 28d7106397801..2f7a9f2de78f3 100644 --- a/libc/test/src/math/expm1f_test.cpp +++ b/libc/test/src/math/expm1f_test.cpp @@ -7,14 +7,20 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/libc_errno.h" +#include "src/__support/macros/optimization.h" #include "src/math/expm1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest; @@ -93,7 +99,7 @@ TEST_F(LlvmLibcExpm1fTest, Borderline) { x = FPBits(0x3e35bec5U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, - LIBC_NAMESPACE::expm1f(x), 0.5); + LIBC_NAMESPACE::expm1f(x), TOLERANCE + 0.5); EXPECT_MATH_ERRNO(0); x = FPBits(0x942ed494U).get_val(); @@ -103,7 +109,7 @@ TEST_F(LlvmLibcExpm1fTest, Borderline) { x = FPBits(0xbdc1c6cbU).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, - LIBC_NAMESPACE::expm1f(x), 0.5); + LIBC_NAMESPACE::expm1f(x), TOLERANCE + 0.5); EXPECT_MATH_ERRNO(0); } diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp index 62a19d02309ad..3e48f7447b6cb 100644 --- a/libc/test/src/math/log10_test.cpp +++ b/libc/test/src/math/log10_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/log10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcLog10Test = LIBC_NAMESPACE::testing::FPTest; @@ -66,7 +72,7 @@ TEST_F(LlvmLibcLog10Test, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, - LIBC_NAMESPACE::log10(x), 0.5); + LIBC_NAMESPACE::log10(x), TOLERANCE + 0.5); } } @@ -118,10 +124,10 @@ TEST_F(LlvmLibcLog10Test, InDoubleRange) { } } } - tlog << " Log10 failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Log10 failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Log10, mx, mr, 0.5, rounding_mode); } }; diff --git 
a/libc/test/src/math/log10f16_test.cpp b/libc/test/src/math/log10f16_test.cpp index a71e3309ac5f0..0e456dc96a6ce 100644 --- a/libc/test/src/math/log10f16_test.cpp +++ b/libc/test/src/math/log10f16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/log10f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcLog10f16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,8 @@ TEST_F(LlvmLibcLog10f16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, - LIBC_NAMESPACE::log10f16(x), 0.5); + LIBC_NAMESPACE::log10f16(x), + TOLERANCE + 0.5); } } @@ -35,6 +43,7 @@ TEST_F(LlvmLibcLog10f16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, - LIBC_NAMESPACE::log10f16(x), 0.5); + LIBC_NAMESPACE::log10f16(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/log10f_test.cpp b/libc/test/src/math/log10f_test.cpp index c554948f7e647..1a1bd14a65d84 100644 --- a/libc/test/src/math/log10f_test.cpp +++ b/libc/test/src/math/log10f_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/log10f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcLog10fTest = LIBC_NAMESPACE::testing::FPTest; @@ -60,7 +66,7 @@ TEST_F(LlvmLibcLog10fTest, TrickyInputs) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, - LIBC_NAMESPACE::log10f(x), 0.5); + LIBC_NAMESPACE::log10f(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp index be83ce899db59..dd720739ac569 100644 --- a/libc/test/src/math/log1p_test.cpp +++ b/libc/test/src/math/log1p_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/log1p.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcLog1pTest = LIBC_NAMESPACE::testing::FPTest; @@ -68,7 +74,7 @@ TEST_F(LlvmLibcLog1pTest, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x, - LIBC_NAMESPACE::log1p(x), 0.5); + LIBC_NAMESPACE::log1p(x), TOLERANCE + 0.5); } } @@ -107,9 +113,8 @@ 
TEST_F(LlvmLibcLog1pTest, InDoubleRange) { continue; ++count; - // ASSERT_MPFR_MATCH(mpfr::Operation::Log1p, x, result, 0.5); if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Log1p, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Log1p, x, result, tol, rounding_mode)) { @@ -119,10 +124,10 @@ TEST_F(LlvmLibcLog1pTest, InDoubleRange) { } } } - tlog << " Log1p failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Log1p failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Log1p, mx, mr, 0.5, rounding_mode); } }; diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp index 2df452629b98d..ef61f5ae41b1a 100644 --- a/libc/test/src/math/log1pf_test.cpp +++ b/libc/test/src/math/log1pf_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/log1pf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcLog1pfTest = LIBC_NAMESPACE::testing::FPTest; @@ -63,7 +69,7 @@ TEST_F(LlvmLibcLog1pfTest, TrickyInputs) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x, - LIBC_NAMESPACE::log1pf(x), 0.5); + LIBC_NAMESPACE::log1pf(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp index 824a8a5d8bb70..9eada859ced84 100644 --- a/libc/test/src/math/log2_test.cpp +++ b/libc/test/src/math/log2_test.cpp @@ -117,10 +117,10 @@ TEST_F(LlvmLibcLog2Test, InDoubleRange) { } } } - tlog << " Log2 failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Log2 failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Log2, mx, mr, 0.5, rounding_mode); } }; diff --git a/libc/test/src/math/log2f16_test.cpp b/libc/test/src/math/log2f16_test.cpp index 6630ca877d8d1..d277b1233693d 100644 --- a/libc/test/src/math/log2f16_test.cpp +++ b/libc/test/src/math/log2f16_test.cpp @@ -6,11 +6,17 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/log2f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcLog2f16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +33,7 @@ TEST_F(LlvmLibcLog2f16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x, - 
LIBC_NAMESPACE::log2f16(x), 0.5); + LIBC_NAMESPACE::log2f16(x), TOLERANCE + 0.5); } } @@ -35,6 +41,6 @@ TEST_F(LlvmLibcLog2f16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x, - LIBC_NAMESPACE::log2f16(x), 0.5); + LIBC_NAMESPACE::log2f16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp index a68d168721e57..c9a030a89c2fd 100644 --- a/libc/test/src/math/log_test.cpp +++ b/libc/test/src/math/log_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/log.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcLogTest = LIBC_NAMESPACE::testing::FPTest; @@ -106,7 +112,7 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) { ++count; // ASSERT_MPFR_MATCH(mpfr::Operation::Log, x, result, 0.5); if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Log, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Log, x, result, tol, rounding_mode)) { @@ -116,10 +122,10 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) { } } } - tlog << " Log failed: " << fails << "/" << count << "/" << cc - << " tests.\n"; - tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; if (fails) { + tlog << " Log failed: " << fails << "/" << count << "/" << cc + << " tests.\n"; + tlog << " Max ULPs is at most: " << static_cast(tol) << ".\n"; EXPECT_MPFR_MATCH(mpfr::Operation::Log, mx, mr, 0.5, rounding_mode); } }; diff --git a/libc/test/src/math/logf16_test.cpp b/libc/test/src/math/logf16_test.cpp index 922918b092b21..70d6304208567 100644 --- a/libc/test/src/math/logf16_test.cpp +++ b/libc/test/src/math/logf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/logf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcLogf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcLogf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, - LIBC_NAMESPACE::logf16(x), 0.5); + LIBC_NAMESPACE::logf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcLogf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, - LIBC_NAMESPACE::logf16(x), 0.5); + LIBC_NAMESPACE::logf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp index 7658075c1738f..fb99080435915 100644 --- a/libc/test/src/math/logf_test.cpp +++ b/libc/test/src/math/logf_test.cpp @@ -7,13 +7,19 @@ 
//===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/logf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcLogfTest = LIBC_NAMESPACE::testing::FPTest; @@ -28,7 +34,11 @@ TEST_F(LlvmLibcLogfTest, SpecialNumbers) { EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, LIBC_NAMESPACE::logf(-0.0f), FE_DIVBYZERO); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::logf(-1.0f), FE_INVALID); +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + EXPECT_TRUE(0.0f == LIBC_NAMESPACE::logf(1.0f)); +#else EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::logf(1.0f)); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS } TEST_F(LlvmLibcLogfTest, TrickyInputs) { @@ -73,7 +83,7 @@ TEST_F(LlvmLibcLogfTest, TrickyInputs) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, - LIBC_NAMESPACE::logf(x), 0.5); + LIBC_NAMESPACE::logf(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp index 1d7072463e0b6..fe8ef4fa85117 100644 --- a/libc/test/src/math/powf_test.cpp +++ b/libc/test/src/math/powf_test.cpp @@ -7,13 +7,19 @@ //===----------------------------------------------------------------------===// #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/powf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcPowfTest = LIBC_NAMESPACE::testing::FPTest; using LIBC_NAMESPACE::testing::tlog; @@ -42,7 +48,7 @@ TEST_F(LlvmLibcPowfTest, TrickyInputs) { float x = INPUTS[i].x; float y = INPUTS[i].y; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Pow, INPUTS[i], - LIBC_NAMESPACE::powf(x, y), 0.5); + LIBC_NAMESPACE::powf(x, y), TOLERANCE + 0.5); } } @@ -88,7 +94,8 @@ TEST_F(LlvmLibcPowfTest, InFloatRange) { mpfr::BinaryInput inputs{x, y}; if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Pow, inputs, - result, 0.5, rounding_mode)) { + result, TOLERANCE + 0.5, + rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY( mpfr::Operation::Pow, inputs, result, tol, rounding_mode)) { diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp index 4d5d9ddf464b1..7eac0348ee6d4 100644 --- a/libc/test/src/math/sin_test.cpp +++ b/libc/test/src/math/sin_test.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/sin.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcSinTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -46,7 +53,7 
@@ TEST_F(LlvmLibcSinTest, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = INPUTS[i]; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sin(x), 0.5); + LIBC_NAMESPACE::sin(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/sincos_test.cpp b/libc/test/src/math/sincos_test.cpp index 09c87153890d0..30bfec646eee9 100644 --- a/libc/test/src/math/sincos_test.cpp +++ b/libc/test/src/math/sincos_test.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/sincos.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcSincosTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -26,36 +33,36 @@ using LIBC_NAMESPACE::testing::tlog; mpfr::ForceRoundingMode __r1(mpfr::RoundingMode::Nearest); \ if (__r1.success) { \ LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::Nearest); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::Nearest); \ } \ \ mpfr::ForceRoundingMode __r2(mpfr::RoundingMode::Upward); \ if (__r2.success) { \ LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::Upward); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::Upward); \ } \ \ mpfr::ForceRoundingMode __r3(mpfr::RoundingMode::Downward); \ if (__r3.success) { \ LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::Downward); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::Downward); \ } \ \ mpfr::ForceRoundingMode __r4(mpfr::RoundingMode::TowardZero); \ if (__r4.success) { \ LIBC_NAMESPACE::sincos(input, &sin_x, &cos_x); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Sin, input, sin_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::TowardZero); \ - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, 0.5, \ + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, input, cos_x, TOLERANCE + 0.5, \ mpfr::RoundingMode::TowardZero); \ } \ } while (0) diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index 87e995db51119..9e6263bb52a67 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -8,14 +8,20 @@ #include "hdr/errno_macros.h" #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/sincosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "test/src/math/sdcomp26094.h" 
#include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcSinCosfTest = LIBC_NAMESPACE::testing::FPTest; @@ -60,36 +66,36 @@ TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) { mpfr::ForceRoundingMode __r1(mpfr::RoundingMode::Nearest); \ if (__r1.success) { \ LIBC_NAMESPACE::sincosf(input, &sin, &cos); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, TOLERANCE + 0.5, \ mpfr::RoundingMode::Nearest); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, TOLERANCE + 0.5, \ mpfr::RoundingMode::Nearest); \ } \ \ mpfr::ForceRoundingMode __r2(mpfr::RoundingMode::Upward); \ if (__r2.success) { \ LIBC_NAMESPACE::sincosf(input, &sin, &cos); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, TOLERANCE + 0.5, \ mpfr::RoundingMode::Upward); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, TOLERANCE + 0.5, \ mpfr::RoundingMode::Upward); \ } \ \ mpfr::ForceRoundingMode __r3(mpfr::RoundingMode::Downward); \ if (__r3.success) { \ LIBC_NAMESPACE::sincosf(input, &sin, &cos); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, TOLERANCE + 0.5, \ mpfr::RoundingMode::Downward); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, TOLERANCE + 0.5, \ mpfr::RoundingMode::Downward); \ } \ \ mpfr::ForceRoundingMode __r4(mpfr::RoundingMode::TowardZero); \ if (__r4.success) { \ LIBC_NAMESPACE::sincosf(input, &sin, &cos); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, TOLERANCE + 0.5, \ mpfr::RoundingMode::TowardZero); \ - EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5, \ + EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, TOLERANCE + 0.5, \ mpfr::RoundingMode::TowardZero); \ } \ } diff --git a/libc/test/src/math/sinf16_test.cpp b/libc/test/src/math/sinf16_test.cpp index b05501cb0f145..d8e9e02f6b1b2 100644 --- a/libc/test/src/math/sinf16_test.cpp +++ b/libc/test/src/math/sinf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/sinf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcSinf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sinf16(x), 0.5); + LIBC_NAMESPACE::sinf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcSinf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sinf16(x), 0.5); + LIBC_NAMESPACE::sinf16(x), TOLERANCE + 0.5); } } diff --git 
a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp index 42de7afa2fb74..516941ba49b9e 100644 --- a/libc/test/src/math/sinf_test.cpp +++ b/libc/test/src/math/sinf_test.cpp @@ -8,6 +8,7 @@ #include "hdr/errno_macros.h" #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" #include "src/math/sinf.h" #include "test/UnitTest/FPMatcher.h" @@ -15,7 +16,13 @@ #include "test/src/math/sdcomp26094.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 3 +#define FP_ASSERT ASSERT_MPFR_MATCH +#else +#define TOLERANCE 0 +#define FP_ASSERT ASSERT_MPFR_MATCH_ALL_ROUNDING +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest; @@ -30,8 +37,13 @@ TEST_F(LlvmLibcSinfTest, SpecialNumbers) { EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::sinf(0.0f)); EXPECT_MATH_ERRNO(0); +// When accuracy is not required, signed zeros might not be preserved. +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + EXPECT_TRUE(0.0f == LIBC_NAMESPACE::sinf(-0.0f)); +#else EXPECT_FP_EQ(-0.0f, LIBC_NAMESPACE::sinf(-0.0f)); EXPECT_MATH_ERRNO(0); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf(inf)); EXPECT_MATH_ERRNO(EDOM); @@ -47,8 +59,8 @@ TEST_F(LlvmLibcSinfTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sinf(x), 0.5); + FP_ASSERT(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), + TOLERANCE + 0.5); } } @@ -95,22 +107,20 @@ TEST_F(LlvmLibcSinfTest, SpecificBitPatterns) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sinf(x), 0.5); - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, -x, - LIBC_NAMESPACE::sinf(-x), 0.5); + FP_ASSERT(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), + TOLERANCE + 0.5); + FP_ASSERT(mpfr::Operation::Sin, -x, LIBC_NAMESPACE::sinf(-x), + TOLERANCE + 0.5); } } // For small values, sin(x) is x. 
TEST_F(LlvmLibcSinfTest, SmallValues) { float x = FPBits(0x1780'0000U).get_val(); - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sinf(x), 0.5); + FP_ASSERT(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), 0.5); x = FPBits(0x0040'0000U).get_val(); - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sinf(x), 0.5); + FP_ASSERT(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), 0.5); } // SDCOMP-26094: check sinf in the cases for which the range reducer @@ -118,7 +128,7 @@ TEST_F(LlvmLibcSinfTest, SmallValues) { TEST_F(LlvmLibcSinfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { float x = FPBits((v)).get_val(); - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, - LIBC_NAMESPACE::sinf(x), 0.5); + FP_ASSERT(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), + TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/sinhf16_test.cpp b/libc/test/src/math/sinhf16_test.cpp index a16ab9279c457..04e0730f0efd4 100644 --- a/libc/test/src/math/sinhf16_test.cpp +++ b/libc/test/src/math/sinhf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/sinhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcSinhf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, - LIBC_NAMESPACE::sinhf16(x), 0.5); + LIBC_NAMESPACE::sinhf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcSinhf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, - LIBC_NAMESPACE::sinhf16(x), 0.5); + LIBC_NAMESPACE::sinhf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp index 6f9ac2fde8278..93a1eecc6186c 100644 --- a/libc/test/src/math/sinhf_test.cpp +++ b/libc/test/src/math/sinhf_test.cpp @@ -8,14 +8,20 @@ #include "hdr/errno_macros.h" #include "hdr/math_macros.h" +#include "hdr/stdint_proxy.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/sinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include "hdr/stdint_proxy.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest; @@ -79,9 +85,9 @@ TEST_F(LlvmLibcSinhfTest, Overflow) { TEST_F(LlvmLibcSinhfTest, ExceptionalValues) { float x = FPBits(uint32_t(0x3a12'85ffU)).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, - LIBC_NAMESPACE::sinhf(x), 0.5); + LIBC_NAMESPACE::sinhf(x), TOLERANCE + 0.5); x = -FPBits(uint32_t(0x3a12'85ffU)).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, - LIBC_NAMESPACE::sinhf(x), 0.5); + LIBC_NAMESPACE::sinhf(x), TOLERANCE + 0.5); } diff --git a/libc/test/src/math/smoke/sinf_test.cpp 
b/libc/test/src/math/smoke/sinf_test.cpp index b0ba81e650d40..7b671c6cfa067 100644 --- a/libc/test/src/math/smoke/sinf_test.cpp +++ b/libc/test/src/math/smoke/sinf_test.cpp @@ -10,6 +10,7 @@ #include "hdr/math_macros.h" #include "hdr/stdint_proxy.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/sinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -26,8 +27,13 @@ TEST_F(LlvmLibcSinfTest, SpecialNumbers) { EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::sinf(0.0f)); EXPECT_MATH_ERRNO(0); +// When accuracy is not required, signed zeros might not be preserved. +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + EXPECT_TRUE(0.0f == LIBC_NAMESPACE::sinf(-0.0f)); +#else EXPECT_FP_EQ(-0.0f, LIBC_NAMESPACE::sinf(-0.0f)); EXPECT_MATH_ERRNO(0); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf(inf)); EXPECT_MATH_ERRNO(EDOM); diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp index 12dfc02bac111..62f8994c9143a 100644 --- a/libc/test/src/math/tan_test.cpp +++ b/libc/test/src/math/tan_test.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/tan.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 4 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcTanTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -53,9 +60,9 @@ TEST_F(LlvmLibcTanTest, TrickyInputs) { for (int i = 0; i < N; ++i) { double x = INPUTS[i]; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, - LIBC_NAMESPACE::tan(x), 0.5); + LIBC_NAMESPACE::tan(x), TOLERANCE + 0.5); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, -x, - LIBC_NAMESPACE::tan(-x), 0.5); + LIBC_NAMESPACE::tan(-x), TOLERANCE + 0.5); } } @@ -89,7 +96,7 @@ TEST_F(LlvmLibcTanTest, InDoubleRange) { ++tested; if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Tan, x, result, - 0.5, rounding_mode)) { + TOLERANCE + 0.5, rounding_mode)) { ++fails; while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Tan, x, result, ulp, rounding_mode)) { diff --git a/libc/test/src/math/tanf16_test.cpp b/libc/test/src/math/tanf16_test.cpp index f2e874182efc1..cd4613bfb0ac4 100644 --- a/libc/test/src/math/tanf16_test.cpp +++ b/libc/test/src/math/tanf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/tanf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcTanf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, - LIBC_NAMESPACE::tanf16(x), 0.5); + LIBC_NAMESPACE::tanf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcTanf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = 
FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, - LIBC_NAMESPACE::tanf16(x), 0.5); + LIBC_NAMESPACE::tanf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp index 2202bd623f5ed..b384d8cab4ea7 100644 --- a/libc/test/src/math/tanf_test.cpp +++ b/libc/test/src/math/tanf_test.cpp @@ -9,12 +9,19 @@ #include "hdr/errno_macros.h" #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/optimization.h" #include "src/math/tanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "test/src/math/sdcomp26094.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + #include "hdr/stdint_proxy.h" using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest; @@ -114,9 +121,9 @@ TEST_F(LlvmLibcTanfTest, SpecificBitPatterns) { for (int i = 0; i < N; ++i) { float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, - LIBC_NAMESPACE::tanf(x), 0.5); + LIBC_NAMESPACE::tanf(x), TOLERANCE + 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, -x, - LIBC_NAMESPACE::tanf(-x), 0.5); + LIBC_NAMESPACE::tanf(-x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/tanhf16_test.cpp b/libc/test/src/math/tanhf16_test.cpp index 7124a83f3d7bc..ff796047309bf 100644 --- a/libc/test/src/math/tanhf16_test.cpp +++ b/libc/test/src/math/tanhf16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/tanhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcTanhf16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,7 @@ TEST_F(LlvmLibcTanhf16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x, - LIBC_NAMESPACE::tanhf16(x), 0.5); + LIBC_NAMESPACE::tanhf16(x), TOLERANCE + 0.5); } } @@ -35,6 +42,6 @@ TEST_F(LlvmLibcTanhf16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x, - LIBC_NAMESPACE::tanhf16(x), 0.5); + LIBC_NAMESPACE::tanhf16(x), TOLERANCE + 0.5); } } diff --git a/libc/test/src/math/tanpif16_test.cpp b/libc/test/src/math/tanpif16_test.cpp index 04d1f713574a1..f994781dd84b1 100644 --- a/libc/test/src/math/tanpif16_test.cpp +++ b/libc/test/src/math/tanpif16_test.cpp @@ -6,11 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/macros/optimization.h" #include "src/math/tanpif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" #include "utils/MPFRWrapper/MPFRUtils.h" +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#define TOLERANCE 1 +#else +#define TOLERANCE 0 +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + using LlvmLibcTanpif16Test = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; @@ -27,7 +34,8 @@ TEST_F(LlvmLibcTanpif16Test, PositiveRange) { for (uint16_t v = POS_START; v <= POS_STOP; ++v) { float16 x = 
FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanpi, x, - LIBC_NAMESPACE::tanpif16(x), 0.5); + LIBC_NAMESPACE::tanpif16(x), + TOLERANCE + 0.5); } } @@ -35,6 +43,7 @@ TEST_F(LlvmLibcTanpif16Test, NegativeRange) { for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { float16 x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanpi, x, - LIBC_NAMESPACE::tanpif16(x), 0.5); + LIBC_NAMESPACE::tanpif16(x), + TOLERANCE + 0.5); } } From b85733094c65d2c69ade3936f008c68787197d5b Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 22 Oct 2025 00:11:25 +0000 Subject: [PATCH 031/530] Revert "Remove unnecessary static_cast in AsmPrinter.cpp." This reverts commit 0e8ee0ec78dc370a1bf2688411cf2db36c3a4cd0. This breaks Windows premerge. https://lab.llvm.org/staging/#/builders/21/builds/6831 --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index a114406b32fb6..e2af0c5925248 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1438,7 +1438,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, BBFreqEnabled, BrProbEnabled, MF.hasBBSections() && NumMBBSectionRanges > 1, - BBAddrMapSkipEmitBBEntries, + static_cast(BBAddrMapSkipEmitBBEntries), HasCalls, false}; } From 1a9aba29b09e7174758572008e8afffb84ca49bd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 21 Oct 2025 17:14:19 -0700 Subject: [PATCH 032/530] [RISCV] Remove unreachable break statements. NFC (#164481) --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 70 ++++++++++++------------ 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 12f776bbd4fa4..912b82d294f44 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1689,42 +1689,44 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, // instruction opcode. Otherwise, return RISCV::INSTRUCTION_LIST_END. // TODO: Support more operations. 
unsigned getPredicatedOpcode(unsigned Opcode) { + // clang-format off switch (Opcode) { - case RISCV::ADD: return RISCV::PseudoCCADD; break; - case RISCV::SUB: return RISCV::PseudoCCSUB; break; - case RISCV::SLL: return RISCV::PseudoCCSLL; break; - case RISCV::SRL: return RISCV::PseudoCCSRL; break; - case RISCV::SRA: return RISCV::PseudoCCSRA; break; - case RISCV::AND: return RISCV::PseudoCCAND; break; - case RISCV::OR: return RISCV::PseudoCCOR; break; - case RISCV::XOR: return RISCV::PseudoCCXOR; break; - - case RISCV::ADDI: return RISCV::PseudoCCADDI; break; - case RISCV::SLLI: return RISCV::PseudoCCSLLI; break; - case RISCV::SRLI: return RISCV::PseudoCCSRLI; break; - case RISCV::SRAI: return RISCV::PseudoCCSRAI; break; - case RISCV::ANDI: return RISCV::PseudoCCANDI; break; - case RISCV::ORI: return RISCV::PseudoCCORI; break; - case RISCV::XORI: return RISCV::PseudoCCXORI; break; - - case RISCV::ADDW: return RISCV::PseudoCCADDW; break; - case RISCV::SUBW: return RISCV::PseudoCCSUBW; break; - case RISCV::SLLW: return RISCV::PseudoCCSLLW; break; - case RISCV::SRLW: return RISCV::PseudoCCSRLW; break; - case RISCV::SRAW: return RISCV::PseudoCCSRAW; break; - - case RISCV::ADDIW: return RISCV::PseudoCCADDIW; break; - case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break; - case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break; - case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break; - - case RISCV::ANDN: return RISCV::PseudoCCANDN; break; - case RISCV::ORN: return RISCV::PseudoCCORN; break; - case RISCV::XNOR: return RISCV::PseudoCCXNOR; break; - - case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; break; - case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; break; + case RISCV::ADD: return RISCV::PseudoCCADD; + case RISCV::SUB: return RISCV::PseudoCCSUB; + case RISCV::SLL: return RISCV::PseudoCCSLL; + case RISCV::SRL: return RISCV::PseudoCCSRL; + case RISCV::SRA: return RISCV::PseudoCCSRA; + case RISCV::AND: return RISCV::PseudoCCAND; + case RISCV::OR: return RISCV::PseudoCCOR; + case RISCV::XOR: return RISCV::PseudoCCXOR; + + case RISCV::ADDI: return RISCV::PseudoCCADDI; + case RISCV::SLLI: return RISCV::PseudoCCSLLI; + case RISCV::SRLI: return RISCV::PseudoCCSRLI; + case RISCV::SRAI: return RISCV::PseudoCCSRAI; + case RISCV::ANDI: return RISCV::PseudoCCANDI; + case RISCV::ORI: return RISCV::PseudoCCORI; + case RISCV::XORI: return RISCV::PseudoCCXORI; + + case RISCV::ADDW: return RISCV::PseudoCCADDW; + case RISCV::SUBW: return RISCV::PseudoCCSUBW; + case RISCV::SLLW: return RISCV::PseudoCCSLLW; + case RISCV::SRLW: return RISCV::PseudoCCSRLW; + case RISCV::SRAW: return RISCV::PseudoCCSRAW; + + case RISCV::ADDIW: return RISCV::PseudoCCADDIW; + case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; + case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; + case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; + + case RISCV::ANDN: return RISCV::PseudoCCANDN; + case RISCV::ORN: return RISCV::PseudoCCORN; + case RISCV::XNOR: return RISCV::PseudoCCXNOR; + + case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; + case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; } + // clang-format on return RISCV::INSTRUCTION_LIST_END; } From 25ba59e6ceb1681bb700c9a146ae6ac3d613ef6e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 22 Oct 2025 08:32:53 +0800 Subject: [PATCH 033/530] [MachineLICM] Use structured bindings for reg pressure cost map. 
NFC (#164368) --- llvm/lib/CodeGen/MachineLICM.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 7acddff753693..729e73c8c312c 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -932,12 +932,11 @@ void MachineLICMImpl::InitRegPressure(MachineBasicBlock *BB) { void MachineLICMImpl::UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef) { auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef); - for (const auto &RPIdAndCost : Cost) { - unsigned Class = RPIdAndCost.first; - if (static_cast(RegPressure[Class]) < -RPIdAndCost.second) + for (const auto &[Class, Weight] : Cost) { + if (static_cast(RegPressure[Class]) < -Weight) RegPressure[Class] = 0; else - RegPressure[Class] += RPIdAndCost.second; + RegPressure[Class] += Weight; } } @@ -1215,11 +1214,10 @@ bool MachineLICMImpl::IsCheapInstruction(MachineInstr &MI) const { /// given cost matrix can cause high register pressure. bool MachineLICMImpl::CanCauseHighRegPressure( const SmallDenseMap &Cost, bool CheapInstr) { - for (const auto &RPIdAndCost : Cost) { - if (RPIdAndCost.second <= 0) + for (const auto &[Class, Weight] : Cost) { + if (Weight <= 0) continue; - unsigned Class = RPIdAndCost.first; int Limit = RegLimit[Class]; // Don't hoist cheap instructions if they would increase register pressure, @@ -1228,7 +1226,7 @@ bool MachineLICMImpl::CanCauseHighRegPressure( return true; for (const auto &RP : BackTrace) - if (static_cast(RP[Class]) + RPIdAndCost.second >= Limit) + if (static_cast(RP[Class]) + Weight >= Limit) return true; } @@ -1246,8 +1244,8 @@ void MachineLICMImpl::UpdateBackTraceRegPressure(const MachineInstr *MI) { // Update register pressure of blocks from loop header to current block. for (auto &RP : BackTrace) - for (const auto &RPIdAndCost : Cost) - RP[RPIdAndCost.first] += RPIdAndCost.second; + for (const auto &[Class, Weight] : Cost) + RP[Class] += Weight; } /// Return true if it is potentially profitable to hoist the given loop From ee0f86df46e619da26f9f59e835928b5c338a5db Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 21 Oct 2025 23:53:02 +0000 Subject: [PATCH 034/530] "Reapply "[Clang] Enable lit internal shell by default" This reverts commit 32de3b9ef9e7e8debc14416e968456ca13b48bea. This relands the enablement of the internal shell for clang by default. This was reverted last time for some Z3 failures that were fixed in 2d550b98cac5. There was also issues around ulimit values persisting causing test failures in LLVM. Those were addressed in 93dd17a0e0a4. --- clang/test/lit.cfg.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index 29088ef367e7c..52b275c095475 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -18,11 +18,22 @@ # name: The name of this test suite. config.name = "Clang" +# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites. +# See https://github.com/llvm/llvm-project/issues/106636 for more details. +# +# We prefer the lit internal shell which provides a better user experience on failures +# and is faster unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 +# env var. 
+use_lit_shell = True +lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL") +if lit_shell_env: + use_lit_shell = lit.util.pythonize_bool(lit_shell_env) + # testFormat: The test format to use to interpret tests. # # For now we require '&&' between commands, until they get globally killed and # the test runner updated. -config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) +config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell) # suffixes: A list of file extensions to treat as test files. config.suffixes = [ From d55b10a4b960abdfc4a05385e254cd33d8896a21 Mon Sep 17 00:00:00 2001 From: Rahman Lavaee Date: Tue, 21 Oct 2025 17:42:12 -0700 Subject: [PATCH 035/530] Add comment about static_cast requirement. --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index e2af0c5925248..fefde64fd1806 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1438,6 +1438,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, BBFreqEnabled, BrProbEnabled, MF.hasBBSections() && NumMBBSectionRanges > 1, + // Use static_cast to avoid breakage of tests on windows. static_cast(BBAddrMapSkipEmitBBEntries), HasCalls, false}; From 6b9a9cf040d33ad7f9cd563a907b13e373313255 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Wed, 22 Oct 2025 01:14:21 +0000 Subject: [PATCH 036/530] [msan][test] Add another target("aarch64.svcount") test case (#164343) This shows a crash that happens because MSan tries to check the shadow of a target("aarch64.svcount")-sized argument. This is the followup to https://github.com/llvm/llvm-project/pull/164315. This also does a drive-by fix of those test cases, to remove FileCheck (otherwise, even if opt passed, the test would still XFAIL because FileCheck cannot find any CHECK: assertions). 
Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll --- .../AArch64/sme-aarch64-svcount-mini.ll | 2 +- .../AArch64/sme-aarch64-svcount.ll | 2 +- .../AArch64/sme2-intrinsics-add-mini.ll | 16 + .../AArch64/sme2-intrinsics-add.ll | 340 ++++++++++++++++++ 4 files changed, 358 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll index 1ddcd4b56688c..1c869bd41b931 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s +; RUN: opt -S -passes=msan -mattr=+sme -o - %s ; XFAIL: * diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll index 9caa89de63748..00cf3204464d0 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s +; RUN: opt -S -passes=msan -mattr=+sme -o - %s ; XFAIL: * diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll new file mode 100644 index 0000000000000..3f43efa233621 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s + +; XFAIL: * + +; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll +; Manually reduced to show MSan leads to a compiler crash + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory { + %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %2 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %1, ptr %ptr) + ret void +} diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll new file mode 100644 index 0000000000000..cd04373c11d20 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll @@ -0,0 +1,340 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=msan -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -o - %s + +; XFAIL: * + +; Forked from llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, %zn0, %zn1, %zm) sanitize_memory { + call void 
@llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice, + %zn0, %zn1, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice.7, + %zn0, %zn1, + %zm) + ret void +} + +define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, %zn0, %zn1, %zm) sanitize_memory { + call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice, + %zn0, %zn1, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice.7, + %zn0, %zn1, + %zm) + ret void +} + + +define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, %zn0, %zn1, + %zn2, %zn3, + %zm) sanitize_memory { + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm) + ret void +} + +define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm) sanitize_memory { + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm) + ret void +} + + +define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, %zn0, %zn1, + %zm1, %zm2) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice, + %zn0, %zn1, + %zm1, %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice.7, + %zn0, %zn1, + %zm1, %zm2) + ret void +} + + +define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, %zn0, %zn1, + %zm1, %zm2) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice, + %zn0, %zn1, + %zm1, %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice.7, + %zn0, %zn1, + %zm1, %zm2) + ret void +} + + + +define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, %zn0, %zn1, + %zn2, %zn3, + %zm0, %zm1, + %zm2, %zm3) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm0, %zm1, + %zm2, %zm3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm0, %zm1, + %zm2, %zm3) + ret void +} + +define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, %zn0, %zn1, + %zn2, %zn3, + %zm0, %zm1, + %zm2, %zm3) sanitize_memory { + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm0, %zm1, + %zm2, %zm3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm0, %zm1, + %zm2, %zm3) + ret void +} + +define void @multi_vector_add_za_vg1x2_i32(i32 %slice, %zn0, %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice, %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, %zn0, %zn1) + ret void +} + +define void @multi_vector_add_za_vg1x2_i64(i32 %slice, %zn0, %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, %zn0, %zn1) + ret void +} + +define void 
@multi_vector_add_za_vg1x2_f32(i32 %slice, %zn0, %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice, + %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice.7, + %zn0, %zn1) + ret void +} + +define void @multi_vector_add_za_vg1x2_f64(i32 %slice, %zn0, %zn1) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice, + %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice.7, + %zn0, %zn1) + ret void +} + +define void @multi_vector_add_za_vg1x2_f64_tuple(i64 %stride, ptr %ptr) sanitize_memory { +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, %2, %5) + call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, %3, %6) + ret void +} + + +define void @multi_vector_add_za_vg1x4_i32(i32 %slice, %zn0, %zn1, %zn2, %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice, + %zn0, %zn1, + %zn2, %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3) + ret void +} + +define void @multi_vector_add_za_vg1x4_i64(i32 %slice, %zn0, %zn1, %zn2, %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice, + %zn0, %zn1, + %zn2, %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3) + ret void +} + +define void @multi_vector_add_za_vg1x4_f32(i32 %slice, %zn0, %zn1, %zn2, %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice, + %zn0, %zn1, + %zn2, %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3) + ret void +} + +define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) sanitize_memory { +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } 
@llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, %2, %7, %12, %17) + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, %3, %8, %13, %18) + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, %4, %9, %14, %19) + call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, %5, %10, %15, %20) + ret void +} + +define void @multi_vector_add_za_vg1x4_f64(i32 %slice, %zn0, %zn1, %zn2, %zn3) sanitize_memory { + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice, + %zn0, %zn1, + %zn2, %zn3) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3) + ret void +} + + +define { , } @multi_vec_add_single_x2_s8( %unused, %zdn1, %zdn2, %zm) sanitize_memory { + %res = call { , } + @llvm.aarch64.sve.add.single.x2.nxv16i8( %zdn1, %zdn2, + %zm) + ret { , } %res +} + +define { , } @multi_vec_add_single_x2_s16( %unused, %zdn1, %zdn2, %zm) sanitize_memory { + %res = call { , } + @llvm.aarch64.sve.add.single.x2.nxv8i16( %zdn1, %zdn2, + %zm) + ret { , } %res +} + +define { , } @multi_vec_add_single_x2_s32( %unused, %zdn1, %zdn2, %zm) sanitize_memory { + %res = call { , } + @llvm.aarch64.sve.add.single.x2.nxv4i32( %zdn1, %zdn2, + %zm) + ret { , } %res +} + +define { , } @multi_vec_add_single_x2_s64( %unused, %zdn1, %zdn2, %zm) sanitize_memory { + %res = call { , } + @llvm.aarch64.sve.add.single.x2.nxv2i64( %zdn1, %zdn2, + %zm) + ret { , } %res +} + + +define { , , , } @multi_vec_add_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) sanitize_memory { + %res = call { , , , } + @llvm.aarch64.sve.add.single.x4.nxv16i8( %zdn1, %zdn2, + %zdn3, %zdn4, + %zm) + ret { , , , } %res +} + +define { , , , } @multi_vec_add_x4_single_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) sanitize_memory { + %res = call { , , , } + @llvm.aarch64.sve.add.single.x4.nxv8i16( %zdn1, %zdn2, + %zdn3, %zdn4, + %zm) + ret { , , , } %res +} + +define { , , , } @multi_vec_add_x4_single_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) sanitize_memory { + %res = call { , , , } + @llvm.aarch64.sve.add.single.x4.nxv4i32( %zdn1, %zdn2, + %zdn3, %zdn4, + %zm) + ret { , , , } %res +} + +define { , , , } @multi_vec_add_x4_single_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) sanitize_memory { + %res = call { , , , } + @llvm.aarch64.sve.add.single.x4.nxv2i64( %zdn1, %zdn2, + %zdn3, %zdn4, + %zm) + ret { , , , } %res +} +declare void@llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, , , ) +declare void@llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, , , ) +declare void@llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, , , , , ) +declare void@llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32, , , , , ) +declare void@llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32, , , , ) +declare void@llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32, , , , ) +declare void@llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32, , , , , , , , ) +declare void@llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32, , , , , , , , ) +declare void@llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32, ,) +declare void@llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32, ,) +declare void@llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32, ,,,) +declare void@llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32, ,,, ) +declare 
void@llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, , ) +declare void@llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, , ) +declare void@llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, , ,, ) +declare void@llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, , ,, ) +declare { , } @llvm.aarch64.sve.add.single.x2.nxv16i8(, , ) +declare { , } @llvm.aarch64.sve.add.single.x2.nxv8i16(, , ) +declare { , } @llvm.aarch64.sve.add.single.x2.nxv4i32(, , ) +declare { , } @llvm.aarch64.sve.add.single.x2.nxv2i64(, , ) +declare { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8(, , , , ) +declare { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16(, , , , ) +declare { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32(, , , , ) +declare { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64(, , , , ) From 90bc75043cef9de0dfb2667508595203208f65b0 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 21 Oct 2025 18:27:21 -0700 Subject: [PATCH 037/530] [RISCV][MC] Introduce XSfvfexp* and XSfvfbfexpa* extensions and their MC supports (#164349) XSfvfbfexp16e, XSfvfexp16e, and XSfvfexp32e are SiFive's vector exponential instruction extensions of BFloat16, F16, and F32, respectively. Spec: https://www.sifive.com/document-file/exponential-function-instruction-xsfvfexp32e-xsfvf XSfvfexpa and XSfvfexpa64e are SiFive's vector exponential approximation instruction extensions where the former supports F16 and F32 and the latter covers F64. These instructions approximate 2 raised to a fractional power. Spec: https://www.sifive.com/document-file/exponential-approximation-instruction-xsfvfexpa-ex This patch adds their corresponding features and MC supports. --------- Co-authored-by: Jesse Huang Co-authored-by: Craig Topper --- .../Driver/print-supported-extensions-riscv.c | 5 +++ .../riscv-target-features-sifive.c | 45 +++++++++++++++++++ .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3 +- .../RISCV/Disassembler/RISCVDisassembler.cpp | 5 ++- .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 3 +- llvm/lib/Target/RISCV/RISCVFeatures.td | 38 ++++++++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 8 ++++ llvm/lib/TargetParser/RISCVISAInfo.cpp | 6 +++ llvm/test/CodeGen/RISCV/features-info.ll | 5 +++ llvm/test/MC/RISCV/xsfvfexp.s | 29 ++++++++++++ llvm/test/MC/RISCV/xsfvfexpa.s | 15 +++++++ .../TargetParser/RISCVISAInfoTest.cpp | 10 +++++ 12 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 llvm/test/MC/RISCV/xsfvfexp.s create mode 100644 llvm/test/MC/RISCV/xsfvfexpa.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 7972b9e43e73e..cb812736786a9 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -190,6 +190,11 @@ // CHECK-NEXT: xsfmm64t 0.6 'XSfmm64t' (TE=64 configuration) // CHECK-NEXT: xsfmmbase 0.6 'XSfmmbase' (All non arithmetic instructions for all TEWs and sf.vtzero) // CHECK-NEXT: xsfvcp 1.0 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions) +// CHECK-NEXT: xsfvfbfexp16e 0.5 'XSfvfbfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, BFloat16) +// CHECK-NEXT: xsfvfexp16e 0.5 'XSfvfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, Half Precision) +// CHECK-NEXT: xsfvfexp32e 0.5 'XSfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction, Single Precision) +// CHECK-NEXT: xsfvfexpa 0.2 'XSfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction) +// CHECK-NEXT: xsfvfexpa64e 0.2 
'XSfvfexpa64e' (SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision) // CHECK-NEXT: xsfvfnrclipxfqf 1.0 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions) // CHECK-NEXT: xsfvfwmaccqqq 1.0 'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction (4-by-4)) // CHECK-NEXT: xsfvqmaccdod 1.0 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)) diff --git a/clang/test/Preprocessor/riscv-target-features-sifive.c b/clang/test/Preprocessor/riscv-target-features-sifive.c index 1c49b5549f3b7..5002f621b7c39 100644 --- a/clang/test/Preprocessor/riscv-target-features-sifive.c +++ b/clang/test/Preprocessor/riscv-target-features-sifive.c @@ -5,6 +5,11 @@ // CHECK-NOT: __riscv_xsfcease {{.*$}} // CHECK-NOT: __riscv_xsfvcp {{.*$}} +// CHECK-NOT: __riscv_xsfvfbfexp16e {{.*$}} +// CHECK-NOT: __riscv_xsfvfexp16e {{.*$}} +// CHECK-NOT: __riscv_xsfvfexp32e {{.*$}} +// CHECK-NOT: __riscv_xsfvfexpa {{.*$}} +// CHECK-NOT: __riscv_xsfvfexpa64e {{.*$}} // CHECK-NOT: __riscv_xsfvfnrclipxfqf {{.*$}} // CHECK-NOT: __riscv_xsfvfwmaccqqq {{.*$}} // CHECK-NOT: __riscv_xsfqmaccdod {{.*$}} @@ -38,6 +43,46 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-XSFVCP-EXT %s // CHECK-XSFVCP-EXT: __riscv_xsfvcp 1000000{{$}} +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfbfexp16e_zvfbfmin -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFBFEXP16E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfbfexp16e_zvfbfmin -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFBFEXP16E-EXT %s +// CHECK-XSFVFBFEXP16E-EXT: __riscv_xsfvfbfexp16e 5000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexp16e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP16E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexp16e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP16E-EXT %s +// CHECK-XSFVFEXP16E-EXT: __riscv_xsfvfexp16e 5000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexp32e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP32E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexp32e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP32E-EXT %s +// CHECK-XSFVFEXP32E-EXT: __riscv_xsfvfexp32e 5000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexpa -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexpa -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA-EXT %s +// CHECK-XSFVFEXPA-EXT: __riscv_xsfvfexpa 2000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexpa64e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA64E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexpa64e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA64E-EXT %s +// CHECK-XSFVFEXPA64E-EXT: __riscv_xsfvfexpa64e 2000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32ixsfvfnrclipxfqf -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFNRCLIPXFQF-EXT %s diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 
e857b2dd78deb..edde7ac487da3 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2406,7 +2406,8 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { } bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) { - if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa)) + if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI->hasFeature(RISCV::FeatureVendorXSfvfbfexp16e)) return Error( ErrorLoc, "operand must be " diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b8ec0bbfcd3bb..4bea4c48dddd1 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -654,7 +654,10 @@ static constexpr FeatureBitset XqciFeatureGroup = { static constexpr FeatureBitset XSfVectorGroup = { RISCV::FeatureVendorXSfvcp, RISCV::FeatureVendorXSfvqmaccdod, RISCV::FeatureVendorXSfvqmaccqoq, RISCV::FeatureVendorXSfvfwmaccqqq, - RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase}; + RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase, + RISCV::FeatureVendorXSfvfexpa, RISCV::FeatureVendorXSfvfexpa64e, + RISCV::FeatureVendorXSfvfbfexp16e, RISCV::FeatureVendorXSfvfexp16e, + RISCV::FeatureVendorXSfvfexp32e}; static constexpr FeatureBitset XSfSystemGroup = { RISCV::FeatureVendorXSiFivecdiscarddlone, RISCV::FeatureVendorXSiFivecflushdlone, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 50f5a5d09a69f..7b9c4b3e800cd 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -220,7 +220,8 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED || RISCVVType::getSEW(Imm) > 64 || (RISCVVType::isAltFmt(Imm) && - !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) || + !(STI.hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI.hasFeature(RISCV::FeatureVendorXSfvfbfexp16e))) || (Imm >> 9) != 0) { O << formatImm(Imm); return; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 3abbbb36315a8..9e6b7f0327eb8 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1335,6 +1335,44 @@ def HasVendorXSfvfnrclipxfqf AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf), "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">; +// Note: XSfvfbfexp16e depends on either Zvfbfmin _or_ Zvfbfa, which cannot be expressed here in +// TableGen. Instead, we check that in RISCVISAInfo. 
+def FeatureVendorXSfvfbfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, BFloat16">; +def HasVendorXSfvfbfexp16e : Predicate<"Subtarget->hasVendorXSfvfbfexp16e()">; + +def FeatureVendorXSfvfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Half Precision", + [FeatureStdExtZvfh]>; +def HasVendorXSfvfexp16e : Predicate<"Subtarget->hasVendorXSfvfexp16e()">; + +def FeatureVendorXSfvfexp32e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Single Precision", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexp32e : Predicate<"Subtarget->hasVendorXSfvfexp32e()">; + +def HasVendorXSfvfexpAnyFloat : Predicate<"Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">; +def HasVendorXSfvfexpAny : Predicate<"Subtarget->hasVendorXSfvfbfexp16e() || Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">, + AssemblerPredicate<(any_of FeatureVendorXSfvfbfexp16e, FeatureVendorXSfvfexp16e, FeatureVendorXSfvfexp32e), + "'Xsfvfbfexp16e', 'Xsfvfexp16e', or 'Xsfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction)">; + +def FeatureVendorXSfvfexpa + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexpa : Predicate<"Subtarget->hasVendorXSfvfexpa()">, + AssemblerPredicate<(all_of FeatureVendorXSfvfexpa), + "'Xsfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction)">; + +def FeatureVendorXSfvfexpa64e + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision", + [FeatureVendorXSfvfexpa, FeatureStdExtZve64d]>; +def HasVendorXSfvfexpa64e : Predicate<"Subtarget->hasVendorXSfvfexpa64e()">; + def FeatureVendorXSiFivecdiscarddlone : RISCVExtension<1, 0, "SiFive sf.cdiscard.d.l1 Instruction", []>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 6a4119a34ac09..4104abd3b0219 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -217,6 +217,14 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, defm FVW : CustomSiFiveVCIX<"fvw", VCIX_XVW, VR, VR, FPR32>, Sched<[]>; } +let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in { + def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">; +} + +let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in { + def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">; +} + let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector", DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in { def SF_VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">; diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 31126cc698b3f..f08a0c0ddd680 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -765,6 +765,12 @@ Error RISCVISAInfo::checkDependency() { if (HasZvl && !HasVector) return getExtensionRequiresError("zvl*b", "v' or 'zve*"); + if (Exts.count("xsfvfbfexp16e") && + !(Exts.count("zvfbfmin") || Exts.count("zvfbfa"))) + return createStringError(errc::invalid_argument, + "'xsfvfbfexp16e' requires 'zvfbfmin' or " + "'zvfbfa' extension to also be specified"); + if (HasD && (HasC || Exts.count("zcd"))) for (auto Ext : ZcdOverlaps) if 
(Exts.count(Ext.str())) diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 693a40d3f39e6..5e5f2b78e8869 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -217,6 +217,11 @@ ; CHECK-NEXT: xsfmm64t - 'XSfmm64t' (TE=64 configuration). ; CHECK-NEXT: xsfmmbase - 'XSfmmbase' (All non arithmetic instructions for all TEWs and sf.vtzero). ; CHECK-NEXT: xsfvcp - 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions). +; CHECK-NEXT: xsfvfbfexp16e - 'XSfvfbfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, BFloat16). +; CHECK-NEXT: xsfvfexp16e - 'XSfvfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, Half Precision). +; CHECK-NEXT: xsfvfexp32e - 'XSfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction, Single Precision). +; CHECK-NEXT: xsfvfexpa - 'XSfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction). +; CHECK-NEXT: xsfvfexpa64e - 'XSfvfexpa64e' (SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision). ; CHECK-NEXT: xsfvfnrclipxfqf - 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions). ; CHECK-NEXT: xsfvfwmaccqqq - 'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction (4-by-4)). ; CHECK-NEXT: xsfvqmaccdod - 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)). diff --git a/llvm/test/MC/RISCV/xsfvfexp.s b/llvm/test/MC/RISCV/xsfvfexp.s new file mode 100644 index 0000000000000..bd6aecdf4381a --- /dev/null +++ b/llvm/test/MC/RISCV/xsfvfexp.s @@ -0,0 +1,29 @@ +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+xsfvfexp32e %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp32e %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfexp32e - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp32e %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+xsfvfexp16e %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp16e %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfexp16e - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexp16e %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+zvfbfmin,+xsfvfbfexp16e %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+zvfbfmin,+xsfvfbfexp16e %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfbfexp16e - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+zvfbfmin,+xsfvfbfexp16e %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +sf.vfexp.v v2, v5, v0.t +# CHECK-INST: sf.vfexp.v v2, v5, v0.t +# CHECK-ENCODING: [0x57,0x91,0x53,0x4c] +# CHECK-ERROR: instruction requires the following: 'Xsfvfbfexp16e', 'Xsfvfexp16e', or 'Xsfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction){{$}} +# CHECK-UNKNOWN: 4c539157 diff --git a/llvm/test/MC/RISCV/xsfvfexpa.s b/llvm/test/MC/RISCV/xsfvfexpa.s 
new file mode 100644 index 0000000000000..317a103122b9c --- /dev/null +++ b/llvm/test/MC/RISCV/xsfvfexpa.s @@ -0,0 +1,15 @@ +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+xsfvfexpa %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexpa %s \ +# RUN: | llvm-objdump -d --mattr=+xsfvfexpa - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsfvfexpa %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +sf.vfexpa.v v2, v5, v0.t +# CHECK-INST: sf.vfexpa.v v2, v5, v0.t +# CHECK-ENCODING: [0x57,0x11,0x53,0x4c] +# CHECK-ERROR: instruction requires the following: 'Xsfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction){{$}} +# CHECK-UNKNOWN: 4c531157 diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 5d69a316a1f16..bfc127530570d 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -730,6 +730,11 @@ TEST(ParseArchString, MissingDepency) { EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()), ""); } + + EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64i_xsfvfbfexp16e", true) + .takeError()), + "'xsfvfbfexp16e' requires 'zvfbfmin' or 'zvfbfa' extension to also " + "be specified"); } TEST(ParseArchString, RejectsUnrecognizedProfileNames) { @@ -1162,6 +1167,11 @@ R"(All available -march extensions for RISC-V xsfmm64t 0.6 xsfmmbase 0.6 xsfvcp 1.0 + xsfvfbfexp16e 0.5 + xsfvfexp16e 0.5 + xsfvfexp32e 0.5 + xsfvfexpa 0.2 + xsfvfexpa64e 0.2 xsfvfnrclipxfqf 1.0 xsfvfwmaccqqq 1.0 xsfvqmaccdod 1.0 From c1212419a52fb9d3eddd563996cf63bd27284066 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 21 Oct 2025 18:44:56 -0700 Subject: [PATCH 038/530] [NFC][SpecialCaseList] Remove LLVM_ABI from SpecialCaseList::Matcher::preprocess. (#164526) Copied here from header by mistake. 
--------- Co-authored-by: Marco Elver --- llvm/lib/Support/SpecialCaseList.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 549c4183298d8..f74e52a3a7fa9 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -111,7 +111,7 @@ Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); } -LLVM_ABI void SpecialCaseList::Matcher::preprocess(bool BySize) { +void SpecialCaseList::Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } From 9df34a1379261ed0ed3c5ad99a302fc12bd2ca78 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Wed, 22 Oct 2025 09:46:14 +0800 Subject: [PATCH 039/530] [mlir][transform] Clearnup tile_one_consumer_using_tile_and_fuse test (NFC) (#164108) --- .../tile-and-fuse-using-interface.mlir | 47 +++++++------------ 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir index 8116044594fca..21d7816934bf9 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir @@ -658,33 +658,20 @@ module attributes {transform.with_named_sequence} { } } -// CHECK: func.func private @tile_one_consumer_using_tile_and_fuse(%[[VAL_0:.*]]: tensor<16x128x48x96xf32>, %[[VAL_1:.*]]: tensor<16x96x48x128xf32>) -> tensor<16x96x48x128xf32> { -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 16 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 128 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 48 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 96 : index -// CHECK: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = scf.for %[[VAL_9:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_7]] iter_args(%[[VAL_10:.*]] = %[[VAL_1]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_11:.*]] = scf.for %[[VAL_12:.*]] = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_13:.*]] = %[[VAL_10]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_3]] iter_args(%[[VAL_19:.*]] = %[[VAL_16]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_20:.*]] = tensor.extract_slice %[[VAL_0]]{{\[}}%[[VAL_9]], %[[VAL_12]], %[[VAL_15]], %[[VAL_18]]] [1, 16, 16, 16] [1, 1, 1, 1] : tensor<16x128x48x96xf32> to tensor<1x16x16x16xf32> -// CHECK: %[[VAL_21:.*]] = tensor.extract_slice %[[VAL_19]]{{\[}}%[[VAL_9]], %[[VAL_18]], %[[VAL_15]], %[[VAL_12]]] [1, 16, 16, 16] [1, 1, 1, 1] : tensor<16x96x48x128xf32> to tensor<1x16x16x16xf32> -// CHECK: %[[VAL_22:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[VAL_20]] : tensor<1x16x16x16xf32>) outs(%[[VAL_21]] : tensor<1x16x16x16xf32>) { -// CHECK: ^bb0(%[[VAL_23:.*]]: f32, %[[VAL_24:.*]]: f32): -// CHECK: linalg.yield %[[VAL_23]] : f32 -// CHECK: } -> tensor<1x16x16x16xf32> -// CHECK: %[[VAL_25:.*]] = tensor.insert_slice %[[VAL_26:.*]] into 
%[[VAL_19]]{{\[}}%[[VAL_9]], %[[VAL_18]], %[[VAL_15]], %[[VAL_12]]] [1, 16, 16, 16] [1, 1, 1, 1] : tensor<1x16x16x16xf32> into tensor<16x96x48x128xf32> -// CHECK: scf.yield %[[VAL_25]] : tensor<16x96x48x128xf32> -// CHECK: } -// CHECK: scf.yield %[[VAL_27:.*]] : tensor<16x96x48x128xf32> -// CHECK: } -// CHECK: scf.yield %[[VAL_28:.*]] : tensor<16x96x48x128xf32> -// CHECK: } -// CHECK: scf.yield %[[VAL_29:.*]] : tensor<16x96x48x128xf32> -// CHECK: } -// CHECK: return %[[VAL_30:.*]] : tensor<16x96x48x128xf32> -// CHECK: } -// CHECK: } - +// CHECK-LABEL: func private @tile_one_consumer_using_tile_and_fuse +// CHECK-SAME: %[[ARG0:.*]]: tensor<16x128x48x96xf32> +// CHECK-SAME: %[[ARG1:.*]]: tensor<16x96x48x128xf32> +// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] = +// CHECK-SAME: iter_args(%[[ITERARG0:.+]] = %[[ARG1]]) +// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] = +// CHECK-SAME: iter_args(%[[ITERARG1:.+]] = %[[ITERARG0]]) +// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] = +// CHECK-SAME: iter_args(%[[ITERARG2:.+]] = %[[ITERARG1]]) +// CHECK: scf.for %[[IV3:[a-zA-Z0-9]+]] = +// CHECK-SAME: iter_args(%[[ITERARG3:.+]] = %[[ITERARG2]]) +// CHECK: %[[TILEDARG0:.*]] = tensor.extract_slice %[[ARG0]]{{\[}}%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]] +// CHECK: %[[TILEDARG1:.*]] = tensor.extract_slice %[[ITERARG3]]{{\[}}%[[IV0]], %[[IV3]], %[[IV2]], %[[IV1]]] +// CHECK: %[[RES:.*]] = linalg.generic +// CHECK-SAME: ins(%[[TILEDARG0]] +// CHECK-SAME: outs(%[[TILEDARG1]] +// CHECK: tensor.insert_slice %[[RES:.*]] From 668bac5810249ae7711b050ba35afaea9ec4e1fd Mon Sep 17 00:00:00 2001 From: Walter Lee <49250218+googlewalt@users.noreply.github.com> Date: Tue, 21 Oct 2025 22:14:53 -0400 Subject: [PATCH 040/530] [asm][ie86-x86-64] Fix build error (#164532) Fixes #164453. --- compiler-rt/lib/builtins/assembly.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h index 79a45d9143b40..d28f73f14b0cf 100644 --- a/compiler-rt/lib/builtins/assembly.h +++ b/compiler-rt/lib/builtins/assembly.h @@ -337,7 +337,7 @@ #endif #endif -#if defined(__i386__) || defined(__amd64__) +#if defined(__ASSEMBLER__) && (defined(__i386__) || defined(__amd64__)) .att_syntax #endif From 36b1ad112102ace3225b7e9b48314e178c25b729 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 21 Oct 2025 19:25:48 -0700 Subject: [PATCH 041/530] workflows/release-binaries: Remove LLVM_PARALLEL_LINK_JOBS option (#164101) This was left over from when we were using smaller runners and was only being applied to the stage2-instrumented build, and not the stage2 build. 
--- .github/workflows/release-binaries.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 83969b5490685..ea70b93edb129 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -230,7 +230,6 @@ jobs: cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \ ${{ needs.prepare.outputs.target-cmake-flags }} \ -C clang/cmake/caches/Release.cmake \ - -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \ -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" - name: Build From 59d4d5c1c3b6f0f81ac51bea26605466268f83e9 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 21 Oct 2025 19:26:56 -0700 Subject: [PATCH 042/530] workflows/release-documentation: Allow secrets pass through from calling workflow (#162765) This should fix the part of the workflow that creates a PR with the new documentation. --- .github/workflows/release-documentation.yml | 4 ++++ .github/workflows/release-tasks.yml | 3 +++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index d3d375d3a6df9..4cf973d000a4b 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -25,6 +25,10 @@ on: description: 'Upload documentation' required: false type: boolean + secrets: + WWW_RELEASES_TOKEN: + description: "Secret used to create a PR with the documentation changes." + required: false jobs: release-documentation: diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml index a184996968cdd..d4c2a55fcc9d7 100644 --- a/.github/workflows/release-tasks.yml +++ b/.github/workflows/release-tasks.yml @@ -54,6 +54,9 @@ jobs: with: release-version: ${{ needs.validate-tag.outputs.release-version }} upload: true + # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use. + secrets: + WWW_RELEASES_TOKEN: ${{ secrets.WWW_RELEASES_TOKEN }} release-doxygen: name: Build and Upload Release Doxygen From 7102ff4a2114f72dc37c392e39895643e63aa3fb Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 21 Oct 2025 19:31:08 -0700 Subject: [PATCH 043/530] Potential fix for code scanning alert no. 1440: Code injection (#162764) Fix for: https://github.com/llvm/llvm-project/security/code-scanning/1440 This fix was automatically generated by Copilot Autofix for CodeQL. See https://docs.github.com/code-security/code-scanning/managing-code-scanning-alerts/about-autofix-for-codeql-code-scanning for more info. 
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .github/workflows/libcxx-run-benchmarks.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 0379a0a1f857d..9e8f55859fc7a 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -64,17 +64,21 @@ jobs: path: repo # Avoid nuking the workspace, where we have the Python virtualenv - name: Run baseline + env: + BENCHMARKS: ${{ steps.vars.outputs.benchmarks }} run: | source .venv/bin/activate && cd repo python -m pip install -r libcxx/utils/requirements.txt baseline_commit=$(git merge-base ${{ steps.vars.outputs.pr_base }} ${{ steps.vars.outputs.pr_head }}) - ./libcxx/utils/test-at-commit --commit ${baseline_commit} -B build/baseline -- -sv -j1 --param optimization=speed ${{ steps.vars.outputs.benchmarks }} + ./libcxx/utils/test-at-commit --commit ${baseline_commit} -B build/baseline -- -sv -j1 --param optimization=speed "$BENCHMARKS" ./libcxx/utils/consolidate-benchmarks build/baseline | tee baseline.lnt - name: Run candidate + env: + BENCHMARKS: ${{ steps.vars.outputs.benchmarks }} run: | source .venv/bin/activate && cd repo - ./libcxx/utils/test-at-commit --commit ${{ steps.vars.outputs.pr_head }} -B build/candidate -- -sv -j1 --param optimization=speed ${{ steps.vars.outputs.benchmarks }} + ./libcxx/utils/test-at-commit --commit ${{ steps.vars.outputs.pr_head }} -B build/candidate -- -sv -j1 --param optimization=speed "$BENCHMARKS" ./libcxx/utils/consolidate-benchmarks build/candidate | tee candidate.lnt - name: Compare baseline and candidate runs From 1906c3e1e30759d2eb85b2833e8c5ff64128bfba Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 21 Oct 2025 19:46:40 -0700 Subject: [PATCH 044/530] [CMake][Release] Stop linking against stage1 runtimes (#164017) This was causing a build failure on Darwin and a test failure on Linux. This config is not widely used or well tested, so I don't think the potential and likely small performance gains and the portability from this are worth the maintenance costs. --- clang/cmake/caches/Release.cmake | 39 ++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake index a523cc561b3f9..82bfdc0cfd565 100644 --- a/clang/cmake/caches/Release.cmake +++ b/clang/cmake/caches/Release.cmake @@ -44,6 +44,19 @@ set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "") set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "") set(LLVM_RELEASE_ENABLE_RUNTIMES ${DEFAULT_RUNTIMES} CACHE STRING "") set(LLVM_RELEASE_ENABLE_PROJECTS ${DEFAULT_PROJECTS} CACHE STRING "") + +# This option enables linking stage2 clang statically with the runtimes +# (libc++ and compiler-rt) from stage1. In theory this will give the +# binaries better performance and make them more portable. However, +# this configuration is not well tested and causes build failures with +# the flang-rt tests cases, since the -stclib=libc++ flag does not +# get propagated to the runtimes build. There is also a separate +# issue on Darwin where clang will use the local libc++ headers, but +# link with system libc++ which can cause some incompatibilities. +# See https://github.com/llvm/llvm-project/issues/77653 +# Because of these problems, this option will default to OFF. 
+set(LLVM_RELEASE_ENABLE_LINK_LOCAL_RUNTIMES OFF CACHE BOOL "") + # Note we don't need to add install here, since it is one of the pre-defined # steps. set(LLVM_RELEASE_FINAL_STAGE_TARGETS "clang;package;check-all;check-llvm;check-clang" CACHE STRING "") @@ -55,8 +68,12 @@ set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") set(STAGE1_PROJECTS "clang") +# Need to build compiler-rt in order to use PGO for later stages. +set(STAGE1_RUNTIMES "compiler-rt") # Build all runtimes so we can statically link them into the stage2 compiler. -set(STAGE1_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind") +if(LLVM_RELEASE_ENABLE_LINK_LOCAL_RUNTIMES) + list(APPEND STAGE1_RUNTIMES "libcxx;libcxxabi;libunwind") +endif() if (LLVM_RELEASE_ENABLE_PGO) list(APPEND STAGE1_PROJECTS "lld") @@ -118,11 +135,13 @@ set_instrument_and_final_stage_var(LLVM_ENABLE_LTO "${LLVM_RELEASE_ENABLE_LTO}" if (LLVM_RELEASE_ENABLE_LTO) set_instrument_and_final_stage_var(LLVM_ENABLE_LLD "ON" BOOL) endif() -set_instrument_and_final_stage_var(LLVM_ENABLE_LIBCXX "ON" BOOL) -set_instrument_and_final_stage_var(LLVM_STATIC_LINK_CXX_STDLIB "ON" BOOL) -set(RELEASE_LINKER_FLAGS "-rtlib=compiler-rt --unwindlib=libunwind") -if(NOT ${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin") - set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -static-libgcc") +if(LLVM_RELEASE_ENABLE_LINK_LOCAL_RUNTIMES) + set_instrument_and_final_stage_var(LLVM_ENABLE_LIBCXX "ON" BOOL) + set_instrument_and_final_stage_var(LLVM_STATIC_LINK_CXX_STDLIB "ON" BOOL) + set(RELEASE_LINKER_FLAGS "-rtlib=compiler-rt --unwindlib=libunwind") + if(NOT ${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin") + set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -static-libgcc") + endif() endif() # Set flags for bolt @@ -130,9 +149,11 @@ if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -Wl,--emit-relocs,-znow") endif() -set_instrument_and_final_stage_var(CMAKE_EXE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) -set_instrument_and_final_stage_var(CMAKE_SHARED_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) -set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) +if (RELEASE_LINKER_FLAGS) + set_instrument_and_final_stage_var(CMAKE_EXE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) + set_instrument_and_final_stage_var(CMAKE_SHARED_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) + set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) +endif() # Final Stage Config (stage2) set_final_stage_var(LLVM_ENABLE_RUNTIMES "${LLVM_RELEASE_ENABLE_RUNTIMES}" STRING) From 64df8f83fe293b6c08477975cbb4451c76b51c54 Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Tue, 21 Oct 2025 20:01:08 -0700 Subject: [PATCH 045/530] [lldb][test] Fix TestEmptyFuncThreadStepOut.py (#161788) The test did not work as intended when the empty function `done()` contained prologue/epilogue code, because a breakpoint was set before the last instruction of the function, which caused the test to pass even with the fix from #126838 having been reverted. The test is intended to check a case when a breakpoint is set on a return instruction, which is the very last instruction of a function. When stepping out from this breakpoint, there is interaction between `ThreadPlanStepOut` and `ThreadPlanStepOverBreakpoint` that could lead to missing the stop location in the outer frame; the detailed explanation can be found in #126838. 
On `Linux/AArch64`, the source is compiled into: ``` > objdump -d main.o 0000000000000000 : 0: d65f03c0 ret ``` So, when the command `bt set -n done` from the original test sets a breakpoint to the first instruction of `done()`, this instruction luckily also happens to be the last one. On `Linux/x86_64`, it compiles into: ``` > objdump -d main.o 0000000000000000 : 0: 55 push %rbp 1: 48 89 e5 mov %rsp,%rbp 4: 5d pop %rbp 5: c3 ret ``` In this case, setting a breakpoint by function name means setting it several instructions before `ret`, which does not provoke the interaction between `ThreadPlanStepOut` and `ThreadPlanStepOverBreakpoint`. --- .../TestEmptyFuncThreadStepOut.py | 28 +++++++++++++++---- .../thread/finish-from-empty-func/main.c | 1 + 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py b/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py index f5d3da530f4f5..c95a57ff55815 100644 --- a/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py +++ b/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py @@ -13,12 +13,31 @@ class FinishFromEmptyFunctionTestCase(TestBase): @skipIf(compiler="clang", compiler_version=['<', '17.0']) def test_finish_from_empty_function(self): - """Test that when stopped at a breakpoint in an empty function, finish leaves it correctly.""" + """Test that when stopped at a breakpoint located at the last instruction + of a function, finish leaves it correctly.""" self.build() - exe = self.getBuildArtifact("a.out") - target, process, thread, _ = lldbutil.run_to_name_breakpoint( - self, "done", exe_name=exe + target, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "// Set breakpoint here", lldb.SBFileSpec("main.c") ) + # Find the address of the last instruction of 'done()' and set a breakpoint there. + # Even though 'done()' is empty, it may contain prologue and epilogue code, so + # simply setting a breakpoint at the function can place it before 'ret'. + error = lldb.SBError() + ret_bp_addr = lldb.SBAddress() + while True: + thread.StepInstruction(False, error) + self.assertTrue(error.Success()) + frame = thread.GetSelectedFrame() + if "done" in frame.GetFunctionName(): + ret_bp_addr = frame.GetPCAddress() + elif ret_bp_addr.IsValid(): + # The entire function 'done()' has been stepped through, so 'ret_bp_addr' + # now contains the address of its last instruction, i.e. 'ret'. + break + ret_bp = target.BreakpointCreateByAddress(ret_bp_addr.GetLoadAddress(target)) + self.assertTrue(ret_bp.IsValid()) + # Resume the execution and hit the new breakpoint. 
+ self.runCmd("cont") if self.TraceOn(): self.runCmd("bt") @@ -29,7 +48,6 @@ def test_finish_from_empty_function(self): ) self.assertTrue(safety_bp.IsValid()) - error = lldb.SBError() thread.StepOut(error) self.assertTrue(error.Success()) diff --git a/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c b/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c index bc66a548a89df..b3f90db5e2562 100644 --- a/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c +++ b/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c @@ -2,6 +2,7 @@ void done() {} int main() { puts("in main"); + done(); // Set breakpoint here done(); puts("leaving main"); return 0; From 41204960c089d45d5bfcece809ddac95f0726cc6 Mon Sep 17 00:00:00 2001 From: Connector Switch Date: Wed, 22 Oct 2025 11:13:57 +0800 Subject: [PATCH 046/530] [libc++][C++03] Cherry pick #164272 (#164447) --- libcxx/include/__cxx03/__algorithm/for_each.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/libcxx/include/__cxx03/__algorithm/for_each.h b/libcxx/include/__cxx03/__algorithm/for_each.h index d160a9eddc50b..1ffb0137c0af7 100644 --- a/libcxx/include/__cxx03/__algorithm/for_each.h +++ b/libcxx/include/__cxx03/__algorithm/for_each.h @@ -14,15 +14,11 @@ #include <__cxx03/__config> #include <__cxx03/__iterator/segmented_iterator.h> #include <__cxx03/__type_traits/enable_if.h> -#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif -_LIBCPP_PUSH_MACROS -#include <__cxx03/__undef_macros> - _LIBCPP_BEGIN_NAMESPACE_STD template @@ -34,6 +30,4 @@ _LIBCPP_HIDE_FROM_ABI _Function for_each(_InputIterator __first, _InputIterator _LIBCPP_END_NAMESPACE_STD -_LIBCPP_POP_MACROS - #endif // _LIBCPP___CXX03___ALGORITHM_FOR_EACH_H From bc7e3ff5c55ccdf4b9981cfa255cca858190e3fb Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 11:43:41 +0800 Subject: [PATCH 047/530] [clang] Remove rest TargetOptions::UnsafeFPMath use (#164525) This should be the last use in clang. --- clang/lib/CodeGen/BackendUtil.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 602068436101b..ecfbcb5970092 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -430,12 +430,6 @@ static bool initTargetOptions(const CompilerInstance &CI, Options.NoInfsFPMath = LangOpts.NoHonorInfs; Options.NoNaNsFPMath = LangOpts.NoHonorNaNs; Options.NoZerosInBSS = CodeGenOpts.NoZeroInitializedInBSS; - Options.UnsafeFPMath = LangOpts.AllowFPReassoc && LangOpts.AllowRecip && - LangOpts.NoSignedZero && LangOpts.ApproxFunc && - (LangOpts.getDefaultFPContractMode() == - LangOptions::FPModeKind::FPM_Fast || - LangOpts.getDefaultFPContractMode() == - LangOptions::FPModeKind::FPM_FastHonorPragmas); Options.BBAddrMap = CodeGenOpts.BBAddrMap; Options.BBSections = From 57e976a3c33d3fe127602f07e0df8c2a59cba4f8 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 21 Oct 2025 21:02:23 -0700 Subject: [PATCH 048/530] workflows/release-binaries: Remove references to ccache (#164102) We aren't using ccache any more so we don't need these variables. 
--- .github/workflows/release-binaries.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index ea70b93edb129..3f2eb3a1d8ea8 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -58,7 +58,6 @@ jobs: ref: ${{ steps.vars.outputs.ref }} upload: ${{ steps.vars.outputs.upload }} target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }} - ccache: ${{ steps.vars.outputs.ccache }} build-flang: ${{ steps.vars.outputs.build-flang }} release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }} release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }} @@ -123,13 +122,6 @@ jobs: echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT target="$RUNNER_OS-$RUNNER_ARCH" - # The hendrikmuhs/ccache-action action does not support installing sccache - # on arm64 Linux. - if [ "$target" = "Linux-ARM64" ]; then - echo ccache=ccache >> $GITHUB_OUTPUT - else - echo ccache=sccache >> $GITHUB_OUTPUT - fi # The macOS builds try to cross compile some libraries so we need to # add extra CMake args to disable them. @@ -222,8 +214,6 @@ jobs: - name: Configure id: build shell: bash - env: - CCACHE_BIN: ${{ needs.prepare.outputs.ccache }} run: | # There were some issues on the ARM64 MacOS runners with trying to build x86 object, # so we need to set some extra cmake flags to disable this. From 829804724bda1d8f6c79ec9ef14ccab57e2f89e1 Mon Sep 17 00:00:00 2001 From: Abhinav Gaba Date: Tue, 21 Oct 2025 21:15:52 -0700 Subject: [PATCH 049/530] [NFC][OpenMP] Update a test that was failing on aarch64. (#164456) The failure was reported here: https://github.com/llvm/llvm-project/pull/164039#issuecomment-3425429556 The test was checking for the "bad" behavior so as to keep track of it, but there seem to be some issues with the pointer arithmetic specific to aarch64. The update for now is to not check for the "bad" behavior fully. We may need to debug further if similar issues are encountered eventually once the codegen has been fixed. 
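As a rough illustration of the pointer-arithmetic change in the diff below (a standalone sketch, not code from this patch; the struct and field names here are made up), a byte offset between two addresses is better computed as a signed `char *` difference than as unsigned `uintptr_t` subtraction, which wraps around whenever the result would be negative:

```cpp
#include <cstddef>
#include <cstdio>

struct S {
  int m = 0;
  double d = 0.0;
};

int main() {
  S s;
  // Signed byte offset between two member addresses: cast to char* and
  // subtract; the result is a ptrdiff_t and may legitimately be negative.
  std::ptrdiff_t offset =
      reinterpret_cast<char *>(&s.m) - reinterpret_cast<char *>(&s.d);

  // An unsigned (uintptr_t) subtraction of the same two addresses would
  // wrap instead of producing the negative value, which is the kind of
  // mismatch the updated test avoids.
  std::printf("offset = %td\n", offset);
  return 0;
}
```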
--- ...a_use_device_addr_class_member_ref_with_map.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp index 5e8769eb3079d..50a28e0d9514b 100644 --- a/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp @@ -16,7 +16,7 @@ struct ST { int m = 0; void f6() { - uintptr_t offset = (uintptr_t)&d - n; + ptrdiff_t offset = (char *)&d - ((char *)(uintptr_t)n); #pragma omp target data map(to : m, d) { void *mapped_ptr = omp_get_mapped_ptr(&d, omp_get_default_device()); @@ -34,11 +34,15 @@ struct ST { // ref/attach modifiers: // &ref_ptee(this[0].[d])), &ref_ptee(this[0].d), TO | FROM // &ref_ptr(this[0].d), &ref_ptee(this[0].d), 4, ATTACH - // EXPECTED: 1 0 - // CHECK: 0 1 - printf("%d %d\n", &d == mapped_ptr, - (uintptr_t)&d == (uintptr_t)mapped_ptr - offset); + // EXPECTED: 1 + // CHECK-NEXT: 0 + printf("%d\n", &d == mapped_ptr); + ptrdiff_t offset_device = (char *)mapped_ptr - (char *)&d; + printf("offset = %td (%p), offset_device = %td (%p)\n", offset, + (void *)offset, offset_device, (void *)offset_device); + printf("mapped_ptr = %p, device_addr = %p, ", mapped_ptr, &d); } + printf("host_addr = %p\n", &d); } } }; From 093af77ece1e0ec6581430bb9f78a9b55fd8b3bb Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Wed, 22 Oct 2025 00:34:21 -0400 Subject: [PATCH 050/530] [clangd] Avoid calling resolveTypeOfCallExpr() twice in HeuristicResolver::resolveExprToType() (#164353) Fixes the performance regression reported at https://github.com/llvm/llvm-project/pull/156282#issuecomment-3421570600 --- clang/lib/Sema/HeuristicResolver.cpp | 5 +++++ .../unittests/Sema/HeuristicResolverTest.cpp | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp index cbdefaa57aacb..056e13308b7d3 100644 --- a/clang/lib/Sema/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -450,7 +450,12 @@ QualType HeuristicResolverImpl::resolveExprToType(const Expr *E) { if (const auto *CE = dyn_cast(E)) { if (QualType Resolved = resolveTypeOfCallExpr(CE); !Resolved.isNull()) return Resolved; + + // Don't proceed to try resolveExprToDecls(), it would just call + // resolveTypeOfCallExpr() again. + return E->getType(); } + // Similarly, unwrapping a unary dereference operation does not work via // resolveExprToDecls. if (const auto *UO = dyn_cast(E->IgnoreParenCasts())) { diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp index a00632f232ea5..c592e74c41a95 100644 --- a/clang/unittests/Sema/HeuristicResolverTest.cpp +++ b/clang/unittests/Sema/HeuristicResolverTest.cpp @@ -524,6 +524,28 @@ TEST(HeuristicResolver, MemberExpr_HangIssue126536) { cxxDependentScopeMemberExpr(hasMemberName("foo")).bind("input")); } +TEST(HeuristicResolver, MemberExpr_HangOnLongCallChain) { + const size_t CallChainLength = 50; + std::string Code = R"cpp( + template + void foo(T t) { + t + )cpp"; + for (size_t I = 0; I < CallChainLength; ++I) + Code.append(".method()\n"); + Code.append(R"cpp( + .lastMethod(); + } + )cpp"); + // Test that resolution of a name whose base is a long call chain + // does not hang. 
Note that the hang for which this is a regression + // test is finite (exponential runtime in the length of the chain), + // so a "failure" here manifests as abnormally long runtime. + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("lastMethod")).bind("input")); +} + TEST(HeuristicResolver, MemberExpr_DefaultTemplateArgument) { std::string Code = R"cpp( struct Default { From d6b68545aa6249bf4445e0ef96c0134d69a98a26 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Wed, 22 Oct 2025 12:47:44 +0800 Subject: [PATCH 051/530] [NFC][libclc] Improve empty builtins error: include ARCH_SUFFIX in message (#164527) To clarify which builtin set has no bytecode files. --- libclc/cmake/modules/AddLibclc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index d8c22191678d4..3228926e765f4 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -392,7 +392,7 @@ function(add_libclc_builtin_set) list( PREPEND bytecode_files ${bytecode_ir_files} ) if( NOT bytecode_files ) - message(FATAL_ERROR "Cannot create an empty builtins library") + message(FATAL_ERROR "Cannot create an empty builtins library for ${ARG_ARCH_SUFFIX}") endif() set( builtins_link_lib_tgt builtins.link.${ARG_ARCH_SUFFIX} ) From dab8fb82a28c13122890f95164c67f4588a15104 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 12:55:30 +0800 Subject: [PATCH 052/530] [Instrumentation] Remove unsafe-fp-math attribute support (#164533) Now codegen part no longer depends on this attribute, remove it. --- .../NumericalStabilitySanitizer.cpp | 1 - .../NumericalStabilitySanitizer/basic.ll | 29 +------------------ 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index d18c0d0d2d90d..80e77e099c695 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -2020,7 +2020,6 @@ static void moveFastMathFlags(Function &F, F.removeFnAttr(attr); \ FMF.set##setter(); \ } - MOVE_FLAG("unsafe-fp-math", Fast) MOVE_FLAG("no-infs-fp-math", NoInfs) MOVE_FLAG("no-nans-fp-math", NoNaNs) MOVE_FLAG("no-signed-zeros-fp-math", NoSignedZeros) diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll index 434ac8462b624..3d759f7a47d49 100644 --- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll +++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll @@ -865,33 +865,6 @@ entry: ret float %r } -; Note that the `unsafe-fp-math` from the function attributes should be moved to -; individual instructions, with the shadow instructions NOT getting the attribute. 
-define float @param_add_return_float_unsafe_fp_math(float %a) #0 { -; CHECK-LABEL: @param_add_return_float_unsafe_fp_math( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__nsan_shadow_args_tag, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (ptr @param_add_return_float_unsafe_fp_math to i64) -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr @__nsan_shadow_args_ptr, align 1 -; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] -; CHECK-NEXT: store i64 0, ptr @__nsan_shadow_args_tag, align 8 -; CHECK-NEXT: [[B:%.*]] = fadd fast float [[A]], 1.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 1.000000e+00 -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__nsan_internal_check_float_d(float [[B]], double [[TMP5]], i32 1, i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = fpext float [[B]] to double -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP7]], double [[TMP8]], double [[TMP5]] -; CHECK-NEXT: store i64 ptrtoint (ptr @param_add_return_float_unsafe_fp_math to i64), ptr @__nsan_shadow_ret_tag, align 8 -; CHECK-NEXT: store double [[TMP9]], ptr @__nsan_shadow_ret_ptr, align 8 -; CHECK-NEXT: ret float [[B]] -; -entry: - %b = fadd float %a, 1.0 - ret float %b -} - - define void @truncate(<2 x double> %0) sanitize_numerical_stability { ; DQQ-LABEL: @truncate( ; DQQ-NEXT: entry: @@ -941,4 +914,4 @@ entry: } -attributes #0 = { nounwind readonly uwtable sanitize_numerical_stability "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind readonly uwtable sanitize_numerical_stability "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } From d11b7bd8c271816ec9e825dc192b4f24ef88d3a4 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 13:17:04 +0800 Subject: [PATCH 053/530] [IR] Remove unsafe-fp-math attribute support (#164534) Now this attribute is no longer recognized by backends, remove it. 
--- llvm/include/llvm/IR/Attributes.td | 2 - ...ining-compatible-and-attribute-transfer.ll | 8 ++-- llvm/test/Transforms/Inline/attributes.ll | 42 ------------------- 3 files changed, 4 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 8e7d9dcebfe2a..8ce2b1bea8fac 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -410,7 +410,6 @@ def LessPreciseFPMAD : StrBoolAttr<"less-precise-fpmad">; def NoInfsFPMath : StrBoolAttr<"no-infs-fp-math">; def NoNansFPMath : StrBoolAttr<"no-nans-fp-math">; def NoSignedZerosFPMath : StrBoolAttr<"no-signed-zeros-fp-math">; -def UnsafeFPMath : StrBoolAttr<"unsafe-fp-math">; def NoJumpTables : StrBoolAttr<"no-jump-tables">; def NoInlineLineTables : StrBoolAttr<"no-inline-line-tables">; def ProfileSampleAccurate : StrBoolAttr<"profile-sample-accurate">; @@ -474,7 +473,6 @@ def : MergeRule<"setAND">; def : MergeRule<"setAND">; def : MergeRule<"setAND">; def : MergeRule<"setAND">; -def : MergeRule<"setAND">; def : MergeRule<"setOR">; def : MergeRule<"setOR">; def : MergeRule<"setOR">; diff --git a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll index b3f2e8102fc1e..15ce3e3c440fe 100644 --- a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll +++ b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll @@ -5,7 +5,7 @@ ; attributes that should be transferred only if it is on all of the regions. ; This includes the attributes, no-nans-fp-math, -; no-signed-zeros-fp-math, less-precise-fpmad, unsafe-fp-math, and +; no-signed-zeros-fp-math, less-precise-fpmad, and ; no-infs-fp-math. Only when each instance of similarity has these attributes ; can we say that the outlined function can have these attributes since that ; is the more general case for these attributes. 
@@ -101,7 +101,7 @@ entry: } attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "less-precise-fpmad"="true" -"unsafe-fp-math"="true" "no-infs-fp-math"="true"} +"no-infs-fp-math"="true"} ; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) [[ATTR1:#[0-9]+]] { ; CHECK: entry_to_outline: @@ -122,5 +122,5 @@ attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "les ; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 -; CHECK: attributes [[ATTR1]] = { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "unsafe-fp-math"="false" } -; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" } +; CHECK: attributes [[ATTR1]] = { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" } +; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/Transforms/Inline/attributes.ll b/llvm/test/Transforms/Inline/attributes.ll index 55ab430f201d6..da7eeda543c79 100644 --- a/llvm/test/Transforms/Inline/attributes.ll +++ b/llvm/test/Transforms/Inline/attributes.ll @@ -601,46 +601,6 @@ define i32 @test_no-signed-zeros-fp-math3(i32 %i) "no-signed-zeros-fp-math"="tru ; CHECK-NEXT: ret i32 } -define i32 @unsafe-fp-math_callee0(i32 %i) "unsafe-fp-math"="false" { - ret i32 %i -; CHECK: @unsafe-fp-math_callee0(i32 %i) [[UNSAFE_FPMATH_FALSE:#[0-9]+]] { -; CHECK-NEXT: ret i32 -} - -define i32 @unsafe-fp-math_callee1(i32 %i) "unsafe-fp-math"="true" { - ret i32 %i -; CHECK: @unsafe-fp-math_callee1(i32 %i) [[UNSAFE_FPMATH_TRUE:#[0-9]+]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math0(i32 %i) "unsafe-fp-math"="false" { - %1 = call i32 @unsafe-fp-math_callee0(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math0(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math1(i32 %i) "unsafe-fp-math"="false" { - %1 = call i32 @unsafe-fp-math_callee1(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math1(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math2(i32 %i) "unsafe-fp-math"="true" { - %1 = call i32 @unsafe-fp-math_callee0(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math2(i32 %i) [[UNSAFE_FPMATH_FALSE]] { -; CHECK-NEXT: ret i32 -} - -define i32 @test_unsafe-fp-math3(i32 %i) "unsafe-fp-math"="true" { - %1 = call i32 @unsafe-fp-math_callee1(i32 %i) - ret i32 %1 -; CHECK: @test_unsafe-fp-math3(i32 %i) [[UNSAFE_FPMATH_TRUE]] { -; CHECK-NEXT: ret i32 -} - ; Test that fn_ret_thunk_extern has no CompatRule; inlining is permitted. ; Test that fn_ret_thunk_extern has no MergeRule; fn_ret_thunk_extern is not ; propagated or dropped on the caller after inlining. 
@@ -693,6 +653,4 @@ define i32 @loader_replaceable_caller() { ; CHECK: attributes [[NO_NANS_FPMATH_TRUE]] = { "no-nans-fp-math"="true" } ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_FALSE]] = { "no-signed-zeros-fp-math"="false" } ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_TRUE]] = { "no-signed-zeros-fp-math"="true" } -; CHECK: attributes [[UNSAFE_FPMATH_FALSE]] = { "unsafe-fp-math"="false" } -; CHECK: attributes [[UNSAFE_FPMATH_TRUE]] = { "unsafe-fp-math"="true" } ; CHECK: attributes [[FNRETTHUNK_EXTERN]] = { fn_ret_thunk_extern } From 14d50bc7d33f598001c7759fe3d61cf1d8b64ef8 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 21 Oct 2025 22:59:18 -0700 Subject: [PATCH 054/530] [NFC][STLExtras] Use GTest matchers for drop_begin/drop_end tests (#164539) --- llvm/unittests/ADT/STLExtrasTest.cpp | 32 +++++++--------------------- 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index 474699835a7dc..966b1f01e8a31 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -27,6 +28,7 @@ using namespace llvm; using testing::ElementsAre; +using testing::ElementsAreArray; using testing::UnorderedElementsAre; namespace { @@ -772,48 +774,30 @@ TEST(STLExtrasTest, DropBeginTest) { SmallVector vec{0, 1, 2, 3, 4}; for (int n = 0; n < 5; ++n) { - int i = n; - for (auto &v : drop_begin(vec, n)) { - EXPECT_EQ(v, i); - i += 1; - } - EXPECT_EQ(i, 5); + EXPECT_THAT(drop_begin(vec, n), + ElementsAreArray(ArrayRef(&vec[n], vec.size() - n))); } } TEST(STLExtrasTest, DropBeginDefaultTest) { SmallVector vec{0, 1, 2, 3, 4}; - int i = 1; - for (auto &v : drop_begin(vec)) { - EXPECT_EQ(v, i); - i += 1; - } - EXPECT_EQ(i, 5); + EXPECT_THAT(drop_begin(vec), ElementsAre(1, 2, 3, 4)); } TEST(STLExtrasTest, DropEndTest) { SmallVector vec{0, 1, 2, 3, 4}; for (int n = 0; n < 5; ++n) { - int i = 0; - for (auto &v : drop_end(vec, n)) { - EXPECT_EQ(v, i); - i += 1; - } - EXPECT_EQ(i, 5 - n); + EXPECT_THAT(drop_end(vec, n), + ElementsAreArray(ArrayRef(vec.data(), vec.size() - n))); } } TEST(STLExtrasTest, DropEndDefaultTest) { SmallVector vec{0, 1, 2, 3, 4}; - int i = 0; - for (auto &v : drop_end(vec)) { - EXPECT_EQ(v, i); - i += 1; - } - EXPECT_EQ(i, 4); + EXPECT_THAT(drop_end(vec), ElementsAre(0, 1, 2, 3)); } TEST(STLExtrasTest, MapRangeTest) { From 76f15ead0100e1a846f3e8d055d91ac669e297be Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 14:23:04 +0800 Subject: [PATCH 055/530] [CodeGen][Target] Remove UnsafeFPMath uses (#164549) Remove `UnsafeFPMath` uses, next is removing the command line option. 
--- llvm/include/llvm/CodeGen/CommandFlags.h | 2 -- llvm/include/llvm/Target/TargetOptions.h | 12 ++---------- llvm/lib/CodeGen/CommandFlags.cpp | 5 +---- llvm/lib/CodeGen/TargetOptionsImpl.cpp | 2 +- llvm/lib/Target/TargetMachine.cpp | 1 - 5 files changed, 4 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h index 39c5a8d479a5f..af66f2d5776ba 100644 --- a/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/llvm/include/llvm/CodeGen/CommandFlags.h @@ -58,8 +58,6 @@ LLVM_ABI CodeGenFileType getFileType(); LLVM_ABI FramePointerKind getFramePointerUsage(); -LLVM_ABI bool getEnableUnsafeFPMath(); - LLVM_ABI bool getEnableNoInfsFPMath(); LLVM_ABI bool getEnableNoNaNsFPMath(); diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index 2c2122a7b204f..bfd2817b8d1f5 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -118,9 +118,8 @@ enum CodeObjectVersionKind { class TargetOptions { public: TargetOptions() - : UnsafeFPMath(false), NoInfsFPMath(false), NoNaNsFPMath(false), - NoTrappingFPMath(true), NoSignedZerosFPMath(false), - EnableAIXExtendedAltivecABI(false), + : NoInfsFPMath(false), NoNaNsFPMath(false), NoTrappingFPMath(true), + NoSignedZerosFPMath(false), EnableAIXExtendedAltivecABI(false), HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false), GuaranteedTailCallOpt(false), StackSymbolOrdering(true), EnableFastISel(false), EnableGlobalISel(false), UseInitArray(false), @@ -156,13 +155,6 @@ class TargetOptions { /// MCAsmInfo::BinutilsVersion. std::pair BinutilsVersion{0, 0}; - /// UnsafeFPMath - This flag is enabled when the - /// -enable-unsafe-fp-math flag is specified on the command line. When - /// this flag is off (the default), the code generator is not allowed to - /// produce results that are "less precise" than IEEE allows. This includes - /// use of X86 instructions like FSIN and FCOS instead of libcalls. - unsigned UnsafeFPMath : 1; - /// NoInfsFPMath - This flag is enabled when the /// -enable-no-infs-fp-math flag is specified on the command line. 
When /// this flag is off (the default), the code generator is not allowed to diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 0522698adf183..7f19161c8d138 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -64,7 +64,6 @@ CGOPT_EXP(uint64_t, LargeDataThreshold) CGOPT(ExceptionHandling, ExceptionModel) CGOPT_EXP(CodeGenFileType, FileType) CGOPT(FramePointerKind, FramePointerUsage) -CGOPT(bool, EnableUnsafeFPMath) CGOPT(bool, EnableNoInfsFPMath) CGOPT(bool, EnableNoNaNsFPMath) CGOPT(bool, EnableNoSignedZerosFPMath) @@ -219,11 +218,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { "Enable frame pointer elimination"))); CGBINDOPT(FramePointerUsage); + [[maybe_unused]] static cl::opt EnableUnsafeFPMath( "enable-unsafe-fp-math", cl::desc("Enable optimizations that may decrease FP precision"), cl::init(false)); - CGBINDOPT(EnableUnsafeFPMath); static cl::opt EnableNoInfsFPMath( "enable-no-infs-fp-math", @@ -552,7 +551,6 @@ TargetOptions codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { TargetOptions Options; Options.AllowFPOpFusion = getFuseFPOps(); - Options.UnsafeFPMath = getEnableUnsafeFPMath(); Options.NoInfsFPMath = getEnableNoInfsFPMath(); Options.NoNaNsFPMath = getEnableNoNaNsFPMath(); Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath(); @@ -706,7 +704,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, if (getStackRealign()) NewAttrs.addAttribute("stackrealign"); - HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math"); HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math"); HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math"); HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math"); diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index 5eb86e740ff7c..049efe8c2a9e4 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -51,7 +51,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume /// that the rounding mode of the FPU can change from its default. 
bool TargetOptions::HonorSignDependentRoundingFPMath() const { - return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; + return HonorSignDependentRoundingFPMathOption; } /// NOTE: There are targets that still do not support the debug entry values diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index cf8569194d778..9bda8a42fd89e 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -158,7 +158,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { Options.X = F.getFnAttribute(Y).getValueAsBool(); \ } while (0) - RESET_OPTION(UnsafeFPMath, "unsafe-fp-math"); RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); From f70808c8bfdd7a6048347b2533ccddc34bbb4678 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Wed, 22 Oct 2025 08:43:44 +0200 Subject: [PATCH 056/530] [CIR] Implement CXXDefaultInitExpr for Constants (#164509) Implement the CXXDefaultInitExpr for Constants --- clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 6 +++--- clang/test/CIR/CodeGen/struct-init.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 65e6a3915f241..800262aac8fa4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -1011,9 +1011,9 @@ class ConstExprEmitter } mlir::Attribute VisitCXXDefaultInitExpr(CXXDefaultInitExpr *die, QualType t) { - cgm.errorNYI(die->getBeginLoc(), - "ConstExprEmitter::VisitCXXDefaultInitExpr"); - return {}; + // No need for a DefaultInitExprScope: we don't handle 'this' in a + // constant expression. + return Visit(die->getExpr(), t); } mlir::Attribute VisitExprWithCleanups(ExprWithCleanups *e, QualType t) { diff --git a/clang/test/CIR/CodeGen/struct-init.cpp b/clang/test/CIR/CodeGen/struct-init.cpp index 2887e6f404ffc..cb509994d1cbf 100644 --- a/clang/test/CIR/CodeGen/struct-init.cpp +++ b/clang/test/CIR/CodeGen/struct-init.cpp @@ -15,6 +15,16 @@ S partial_init = { 1 }; // LLVM: @partial_init = global %struct.S { i32 1, i32 0, i32 0 } // OGCG: @partial_init = global %struct.S { i32 1, i32 0, i32 0 } +struct StructWithDefaultInit { + int a = 2; +}; + +StructWithDefaultInit swdi = {}; + +// CIR: cir.global external @swdi = #cir.const_record<{#cir.int<2> : !s32i}> : !rec_StructWithDefaultInit +// LLVM: @swdi = global %struct.StructWithDefaultInit { i32 2 }, align 4 +// OGCG: @swdi = global %struct.StructWithDefaultInit { i32 2 }, align 4 + void init() { S s1 = {1, 2, 3}; S s2 = {4, 5}; From 655de70c485ef165b6e9576a50686aac629e2711 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Wed, 22 Oct 2025 12:28:39 +0530 Subject: [PATCH 057/530] [NFC][GISel] Pass `APInt` by const reference (#164556) Change function signature to avoid copying the argument at call site. --- .../llvm/CodeGen/GlobalISel/MIPatternMatch.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index b7ccfbb27e51c..8db99ba4ed883 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -210,8 +210,8 @@ struct SpecificConstantMatch { }; /// Matches a constant equal to \p RequestedValue. 
-inline SpecificConstantMatch m_SpecificICst(APInt RequestedValue) { - return SpecificConstantMatch(std::move(RequestedValue)); +inline SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue) { + return SpecificConstantMatch(RequestedValue); } inline SpecificConstantMatch m_SpecificICst(int64_t RequestedValue) { @@ -221,7 +221,7 @@ inline SpecificConstantMatch m_SpecificICst(int64_t RequestedValue) { /// Matcher for a specific constant splat. struct SpecificConstantSplatMatch { APInt RequestedVal; - SpecificConstantSplatMatch(const APInt RequestedVal) + SpecificConstantSplatMatch(const APInt &RequestedVal) : RequestedVal(RequestedVal) {} bool match(const MachineRegisterInfo &MRI, Register Reg) { return isBuildVectorConstantSplat(Reg, MRI, RequestedVal, @@ -230,8 +230,9 @@ struct SpecificConstantSplatMatch { }; /// Matches a constant splat of \p RequestedValue. -inline SpecificConstantSplatMatch m_SpecificICstSplat(APInt RequestedValue) { - return SpecificConstantSplatMatch(std::move(RequestedValue)); +inline SpecificConstantSplatMatch +m_SpecificICstSplat(const APInt &RequestedValue) { + return SpecificConstantSplatMatch(RequestedValue); } inline SpecificConstantSplatMatch m_SpecificICstSplat(int64_t RequestedValue) { @@ -242,7 +243,7 @@ inline SpecificConstantSplatMatch m_SpecificICstSplat(int64_t RequestedValue) { /// Matcher for a specific constant or constant splat. struct SpecificConstantOrSplatMatch { APInt RequestedVal; - SpecificConstantOrSplatMatch(const APInt RequestedVal) + SpecificConstantOrSplatMatch(const APInt &RequestedVal) : RequestedVal(RequestedVal) {} bool match(const MachineRegisterInfo &MRI, Register Reg) { APInt MatchedVal; @@ -263,8 +264,8 @@ struct SpecificConstantOrSplatMatch { /// Matches a \p RequestedValue constant or a constant splat of \p /// RequestedValue. inline SpecificConstantOrSplatMatch -m_SpecificICstOrSplat(APInt RequestedValue) { - return SpecificConstantOrSplatMatch(std::move(RequestedValue)); +m_SpecificICstOrSplat(const APInt &RequestedValue) { + return SpecificConstantOrSplatMatch(RequestedValue); } inline SpecificConstantOrSplatMatch From 5ad2487f9776e1a5d5a38da8d34fcecc90fa891b Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Wed, 22 Oct 2025 15:26:36 +0800 Subject: [PATCH 058/530] [X86] Check MinMax has NaN and replace with NewX for minimumnum/maximumnum (#164546) It is incorrect to just check for NewX and return its ordered elements. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +- .../CodeGen/X86/fminimumnum-fmaximumnum.ll | 789 +++++++++--------- 2 files changed, 392 insertions(+), 404 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b5f8ee50cba3d..b54a1e71d8c10 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29516,11 +29516,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX)) return MinMax; - if (DAG.isKnownNeverNaN(NewX)) - NewX = NewY; - - SDValue IsNaN = - DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO); + SDValue NaNSrc = IsNum ? 
MinMax : NewX; + SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO); return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax); } diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index 0fe107cbbeee3..aae6cda4458d2 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -22,25 +22,24 @@ declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>) define float @test_fmaximumnum(float %x, float %y) nounwind { ; SSE2-LABEL: test_fmaximumnum: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: js .LBB0_2 -; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: js .LBB0_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: jmp .LBB0_3 +; SSE2-NEXT: .LBB0_1: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: .LBB0_3: ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: .LBB0_2: -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordss %xmm3, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB0_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: .LBB0_4: -; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm3, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm0 +; SSE2-NEXT: cmpunordss %xmm3, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum: @@ -56,7 +55,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind { ; AVX1-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-NEXT: .LBB0_3: ; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -70,7 +69,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind { ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq ; @@ -95,7 +94,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind { ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB0_3: ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -371,26 +370,25 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math" ; SSE2-LABEL: test_fmaximumnum_nsz: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordss %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm3 -; SSE2-NEXT: maxss %xmm1, %xmm0 -; SSE2-NEXT: andnps %xmm0, %xmm2 -; SSE2-NEXT: orps %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: cmpunordss %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_nsz: ; AVX1: # %bb.0: ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, 
%xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fmaximumnum_nsz: ; AVX512: # %bb.0: ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -404,9 +402,9 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math" ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1 -; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm1 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -421,23 +419,22 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: js .LBB9_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: .LBB9_2: -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: cmpordss %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB9_4 -; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: js .LBB9_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: jmp .LBB9_3 +; SSE2-NEXT: .LBB9_1: +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: .LBB9_4: -; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: .LBB9_3: +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm0 +; SSE2-NEXT: cmpunordss %xmm3, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: andnps %xmm3, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_combine_cmps: @@ -454,7 +451,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: .LBB9_3: ; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -469,7 +466,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512F-NEXT: retq ; @@ -507,7 +504,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; X86-NEXT: vmovaps %xmm1, %xmm0 ; X86-NEXT: .LBB9_3: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -527,23 +524,23 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: js .LBB10_2 -; SSE2-NEXT: # 
%bb.1: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: .LBB10_2: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: cmpordss %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB10_4 -; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: js .LBB10_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: jmp .LBB10_3 +; SSE2-NEXT: .LBB10_1: +; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: .LBB10_4: -; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: .LBB10_3: +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: minss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: cmpunordss %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: andnps %xmm3, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fminimumnum: @@ -559,7 +556,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; AVX1-NEXT: vmovdqa %xmm1, %xmm0 ; AVX1-NEXT: .LBB10_3: ; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -573,7 +570,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -599,7 +596,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB10_3: ; X86-NEXT: vminss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -857,26 +854,25 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind { ; SSE2-LABEL: test_fminimumnum_nsz: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordss %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm3 -; SSE2-NEXT: minss %xmm1, %xmm0 -; SSE2-NEXT: andnps %xmm0, %xmm2 -; SSE2-NEXT: orps %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: cmpunordss %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fminimumnum_nsz: ; AVX1: # %bb.0: ; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fminimumnum_nsz: ; AVX512: # %bb.0: ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -890,9 +886,9 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1 -; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm2 -; X86-NEXT: 
vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm1 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -907,23 +903,23 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: js .LBB19_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: .LBB19_2: -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: cmpordss %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB19_4 -; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: js .LBB19_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: jmp .LBB19_3 +; SSE2-NEXT: .LBB19_1: +; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: .LBB19_4: -; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: .LBB19_3: +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: minss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: cmpunordss %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: andnps %xmm3, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fminimumnum_combine_cmps: @@ -940,7 +936,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; AVX1-NEXT: vmovaps %xmm2, %xmm0 ; AVX1-NEXT: .LBB19_3: ; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -955,7 +951,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} ; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm1 -; AVX512F-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vcmpunordss %xmm1, %xmm1, %k1 ; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512F-NEXT: vmovaps %xmm1, %xmm0 ; AVX512F-NEXT: retq @@ -994,7 +990,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; X86-NEXT: vmovaps %xmm2, %xmm0 ; X86-NEXT: .LBB19_3: ; X86-NEXT: vminss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -1022,9 +1018,9 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) { ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordpd %xmm3, %xmm0 -; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: orpd %xmm3, %xmm0 ; SSE2-NEXT: retq @@ -1034,7 +1030,7 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) { ; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ 
-1048,7 +1044,7 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) { ; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y) @@ -1084,19 +1080,17 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: cmpordpd %xmm0, %xmm2 -; SSE2-NEXT: andpd %xmm2, %xmm0 -; SSE2-NEXT: andnpd %xmm1, %xmm2 -; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimumnum_vector_zero: ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_zero: @@ -1108,9 +1102,9 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) { ; X86-LABEL: test_fminimumnum_vector_zero: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r @@ -1120,20 +1114,21 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { ; SSE2-LABEL: test_fmaximumnum_vector_signed_zero: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE2-NEXT: maxps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fmaximumnum_vector_signed_zero: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero: @@ -1144,9 +1139,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { ; X86-LABEL: test_fmaximumnum_vector_signed_zero: ; X86: # %bb.0: ; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, 
%xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %r @@ -1155,13 +1150,14 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { ; SSE2-LABEL: test_fminimumnum_vector_partially_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: cmpordpd %xmm0, %xmm1 -; SSE2-NEXT: xorpd %xmm2, %xmm2 -; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movapd %xmm1, %xmm2 ; SSE2-NEXT: minpd %xmm0, %xmm2 -; SSE2-NEXT: andpd %xmm1, %xmm0 -; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 ; SSE2-NEXT: orpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1169,9 +1165,9 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero: @@ -1185,9 +1181,9 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r @@ -1212,9 +1208,9 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: minpd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordpd %xmm3, %xmm0 -; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: orpd %xmm3, %xmm0 ; SSE2-NEXT: retq @@ -1226,7 +1222,7 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { ; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -1244,7 +1240,7 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { ; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> ) @@ -1278,20 +1274,24 @@ define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x 
float> %x) { define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) { ; SSE2-LABEL: test_fminimumnum_vector_nan: ; SSE2: # %bb.0: -; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimumnum_vector_nan: ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX-NEXT: vminpd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_nan: @@ -1306,7 +1306,7 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) { ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 -; X86-NEXT: vcmpordpd %xmm1, %xmm1, %xmm2 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 ; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> ) @@ -1318,19 +1318,17 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: minpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: cmpordpd %xmm0, %xmm2 -; SSE2-NEXT: andpd %xmm2, %xmm0 -; SSE2-NEXT: andnpd %xmm1, %xmm2 -; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimumnum_vector_zero_first: ; AVX: # %bb.0: ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_zero_first: @@ -1342,9 +1340,9 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) { ; X86-LABEL: test_fminimumnum_vector_zero_first: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vandnpd %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r @@ -1378,20 +1376,21 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) { ; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE2-NEXT: maxps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpordps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; 
SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first: @@ -1402,9 +1401,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) { ; X86-LABEL: test_fmaximumnum_vector_signed_zero_first: ; X86: # %bb.0: ; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> , <4 x float> %x) ret <4 x float> %r @@ -1455,11 +1454,11 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: maxps %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: cmpordps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: cmpunordps %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_v4f32_splat: @@ -1468,7 +1467,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -1478,7 +1477,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; AVX512-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmaxps %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; AVX512-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2 ; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq ; @@ -1494,7 +1493,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 ; X86-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmaxps %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %splatinsert = insertelement <4 x float> poison, float %y, i64 0 @@ -1506,134 +1505,130 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; SSE2-LABEL: test_fmaximumnum_v4f16: ; SSE2: # %bb.0: -; SSE2-NEXT: subq $104, %rsp -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: subq $136, %rsp +; 
SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_2: -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: .LBB33_4: -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: maxss %xmm4, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: js .LBB33_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_3 +; SSE2-NEXT: .LBB33_1: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_3: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_6 +; SSE2-NEXT: js .LBB33_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_6: -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: .LBB33_8: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_6 +; SSE2-NEXT: .LBB33_4: ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE2-NEXT: maxss %xmm4, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_6: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_10: -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: cmpordss %xmm2, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: .LBB33_12: -; SSE2-NEXT: maxss %xmm4, %xmm2 +; SSE2-NEXT: js .LBB33_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_9 +; SSE2-NEXT: .LBB33_7: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_9: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: andnps %xmm2, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movaps 
(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movd (%rsp), %xmm4 # 4-byte Folded Reload -; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: js .LBB33_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: .LBB33_14: -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: js .LBB33_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: .LBB33_16: -; SSE2-NEXT: maxss %xmm4, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: js .LBB33_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: jmp .LBB33_12 +; SSE2-NEXT: .LBB33_10: +; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB33_12: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfhf2@PLT ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -1641,7 +1636,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: addq $104, %rsp +; SSE2-NEXT: addq $136, %rsp ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum_v4f16: @@ -1679,7 +1674,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1700,7 +1695,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-NEXT: .LBB33_6: ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1721,7 +1716,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-NEXT: .LBB33_9: ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte 
Spill @@ -1742,7 +1737,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX1-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-NEXT: .LBB33_12: ; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload @@ -1768,7 +1763,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vmaxss %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vcmpordss %xmm3, %xmm3, %k1 +; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1783,7 +1778,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 -; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -1799,7 +1794,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 -; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] @@ -1814,7 +1809,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 -; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -1831,7 +1826,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 -; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] @@ -1846,7 +1841,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 -; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -1860,7 +1855,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: 
vmovss %xmm5, %xmm6, %xmm6 {%k1} ; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 -; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -1875,7 +1870,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: vmovss %xmm1, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm5, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] @@ -1933,7 +1928,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_3: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __extendhfsf2 @@ -1955,7 +1950,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_6: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfhf2 @@ -1993,7 +1988,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_9: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __extendhfsf2 @@ -2015,7 +2010,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: .LBB33_12: ; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfhf2 @@ -2041,120 +2036,114 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: pushq %rbp ; SSE2-NEXT: pushq %r15 ; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: subq $56, %rsp -; SSE2-NEXT: pextrw $0, %xmm1, %r14d -; SSE2-NEXT: pextrw $0, %xmm0, %r15d -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrld $16, %xmm2 -; SSE2-NEXT: pextrw $0, %xmm2, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $16, %xmm2 -; SSE2-NEXT: pextrw $0, %xmm2, %ecx +; SSE2-NEXT: psrlq $48, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrlq $48, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] +; SSE2-NEXT: pextrw $0, %xmm4, %ebp +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] +; SSE2-NEXT: pextrw $0, %xmm4, %r15d +; SSE2-NEXT: pextrw $0, %xmm0, 
%r12d +; SSE2-NEXT: pextrw $0, %xmm1, %r13d +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pextrw $0, %xmm1, %ecx ; SSE2-NEXT: shll $16, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: testl %ecx, %ecx -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: js .LBB34_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: .LBB34_2: -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm0[1,1] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: cmpordss %xmm7, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm7, %xmm4 -; SSE2-NEXT: js .LBB34_4 -; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_4: -; SSE2-NEXT: pextrw $0, %xmm5, %ebp -; SSE2-NEXT: pextrw $0, %xmm6, %ebx -; SSE2-NEXT: maxss %xmm2, %xmm7 -; SSE2-NEXT: andnps %xmm7, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: js .LBB34_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: jmp .LBB34_3 +; SSE2-NEXT: .LBB34_1: +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: .LBB34_3: +; SSE2-NEXT: pextrw $0, %xmm2, %ebx +; SSE2-NEXT: pextrw $0, %xmm3, %r14d +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: shll $16, %r15d -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: shll $16, %r14d -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: testl %r15d, %r15d -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: js .LBB34_6 +; SSE2-NEXT: shll $16, %r13d +; SSE2-NEXT: movd %r13d, %xmm1 +; SSE2-NEXT: shll $16, %r12d +; SSE2-NEXT: movd %r12d, %xmm2 +; SSE2-NEXT: js .LBB34_4 ; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: jmp .LBB34_6 +; SSE2-NEXT: .LBB34_4: +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: .LBB34_6: -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm5 -; SSE2-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrlq $48, %xmm6 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: cmpordss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm1, %xmm4 -; SSE2-NEXT: js .LBB34_8 -; SSE2-NEXT: # %bb.7: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_8: -; SSE2-NEXT: pextrw $0, %xmm5, %r15d -; SSE2-NEXT: pextrw $0, %xmm6, %r14d -; SSE2-NEXT: maxss %xmm2, %xmm1 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: shll $16, %ebx -; SSE2-NEXT: movd %ebx, %xmm1 +; SSE2-NEXT: shll $16, 
%r15d +; SSE2-NEXT: movd %r15d, %xmm1 ; SSE2-NEXT: shll $16, %ebp -; SSE2-NEXT: movd %ebp, %xmm3 -; SSE2-NEXT: testl %ebx, %ebx -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: js .LBB34_10 -; SSE2-NEXT: # %bb.9: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_10: +; SSE2-NEXT: movd %ebp, %xmm2 +; SSE2-NEXT: js .LBB34_7 +; SSE2-NEXT: # %bb.8: ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm4 -; SSE2-NEXT: js .LBB34_12 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: .LBB34_12: -; SSE2-NEXT: maxss %xmm3, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: jmp .LBB34_9 +; SSE2-NEXT: .LBB34_7: +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: .LBB34_9: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE2-NEXT: shll $16, %r14d ; SSE2-NEXT: movd %r14d, %xmm1 -; SSE2-NEXT: shll $16, %r15d -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: testl %r14d, %r14d -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: js .LBB34_14 -; SSE2-NEXT: # %bb.13: -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: .LBB34_14: +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd %ebx, %xmm2 +; SSE2-NEXT: js .LBB34_10 +; SSE2-NEXT: # %bb.11: ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: cmpordss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm4 -; SSE2-NEXT: js .LBB34_16 -; SSE2-NEXT: # %bb.15: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: .LBB34_16: -; SSE2-NEXT: maxss %xmm3, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: jmp .LBB34_12 +; SSE2-NEXT: .LBB34_10: +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: .LBB34_12: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cmpunordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2164,6 +2153,8 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: addq $56, %rsp ; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: popq %r15 ; SSE2-NEXT: popq %rbp @@ -2205,7 +2196,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vpextrw $0, %xmm2, %ebp ; AVX1-NEXT: vpextrw $0, %xmm3, %r15d ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2222,7 +2213,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-NEXT: .LBB34_6: ; AVX1-NEXT: vmaxss %xmm1, 
%xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2239,7 +2230,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-NEXT: .LBB34_9: ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2256,7 +2247,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-NEXT: .LBB34_12: ; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload @@ -2305,7 +2296,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) @@ -2319,7 +2310,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) @@ -2333,7 +2324,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) @@ -2347,7 +2338,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: callq __truncsfbf2@PLT ; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) @@ -2400,7 +2391,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vpextrw $0, %xmm2, %edi ; X86-NEXT: vpextrw $0, %xmm3, %ebp ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: shll $16, %ecx @@ -2416,7 +2407,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB34_6: ; 
X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfbf2 @@ -2436,7 +2427,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB34_9: ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfbf2 @@ -2456,7 +2447,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: .LBB34_12: ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 ; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __truncsfbf2 From 048070ba6f28d3a0a788d71788320549527db94f Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Wed, 22 Oct 2025 09:29:06 +0200 Subject: [PATCH 059/530] [ARM][AArch64] BTI,GCS,PAC Module flag update. (#86212) Module flag is used to indicate the feature to be propagated to the function. As now the frontend emits all attributes accordingly let's help the auto upgrade to only do work when old and new bitcodes are merged. Depends on #82819 and #86031 --- clang/lib/CodeGen/CodeGenModule.cpp | 19 ++- .../CodeGen/AArch64/sign-return-address.c | 12 +- .../CodeGen/arm-branch-protection-attr-2.c | 8 +- .../arm-ignore-branch-protection-option.c | 2 +- llvm/include/llvm/IR/AutoUpgrade.h | 10 ++ llvm/lib/AsmParser/LLParser.cpp | 1 + llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 + llvm/lib/IR/AutoUpgrade.cpp | 114 ++++++++++++++++ llvm/lib/Linker/IRMover.cpp | 10 ++ llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 2 +- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 2 +- .../test/Bitcode/upgrade-branch-protection.ll | 15 ++- .../CodeGen/Thumb2/pacbti-m-outliner-5.ll | 2 +- llvm/test/LTO/AArch64/Inputs/foo.ll | 16 --- llvm/test/LTO/AArch64/TestInputs/bar.ll | 35 +++++ llvm/test/LTO/AArch64/TestInputs/fiz.ll | 41 ++++++ llvm/test/LTO/AArch64/TestInputs/foo.ll | 38 ++++++ llvm/test/LTO/AArch64/TestInputs/old.ll | 59 ++++++++ .../AArch64/link-branch-target-enforcement.ll | 23 ++-- .../LTO/AArch64/link-sign-return-address.ll | 127 ++++++++++++++++++ llvm/test/Linker/link-arm-and-thumb.ll | 6 +- llvm/test/ThinLTO/AArch64/aarch64_inline.ll | 86 ++++++++++++ 22 files changed, 575 insertions(+), 55 deletions(-) delete mode 100644 llvm/test/LTO/AArch64/Inputs/foo.ll create mode 100644 llvm/test/LTO/AArch64/TestInputs/bar.ll create mode 100644 llvm/test/LTO/AArch64/TestInputs/fiz.ll create mode 100644 llvm/test/LTO/AArch64/TestInputs/foo.ll create mode 100644 llvm/test/LTO/AArch64/TestInputs/old.ll create mode 100644 llvm/test/LTO/AArch64/link-sign-return-address.ll create mode 100644 llvm/test/ThinLTO/AArch64/aarch64_inline.ll diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index e490b1ce0efa4..3746bc04f4dd3 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1325,22 +1325,29 @@ void CodeGenModule::Release() { "tag-stack-memory-buildattr", 1); if (T.isARM() || T.isThumb() || T.isAArch64()) { + // Previously 1 is used and meant 
for the backend to derive the function
+    // attribute from it. 2 now means function attributes are already set for all
+    // functions in this module, so there is no need to propagate those from the
+    // module flag. The value is only used in case of an LTO module merge, because
+    // the backend will see all required function attributes set already; value 1
+    // is only seen before modules get merged. Any positive value means the feature
+    // is active and the required binary markings need to be emitted accordingly.
     if (LangOpts.BranchTargetEnforcement)
       getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement",
-                                1);
+                                2);
     if (LangOpts.BranchProtectionPAuthLR)
       getModule().addModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr",
-                                1);
+                                2);
     if (LangOpts.GuardedControlStack)
-      getModule().addModuleFlag(llvm::Module::Min, "guarded-control-stack", 1);
+      getModule().addModuleFlag(llvm::Module::Min, "guarded-control-stack", 2);
     if (LangOpts.hasSignReturnAddress())
-      getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1);
+      getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 2);
     if (LangOpts.isSignReturnAddressScopeAll())
       getModule().addModuleFlag(llvm::Module::Min, "sign-return-address-all",
-                                1);
+                                2);
     if (!LangOpts.isSignReturnAddressWithAKey())
       getModule().addModuleFlag(llvm::Module::Min,
-                                "sign-return-address-with-bkey", 1);
+                                "sign-return-address-with-bkey", 2);
     if (LangOpts.PointerAuthELFGOT)
       getModule().addModuleFlag(llvm::Module::Min, "ptrauth-elf-got", 1);
diff --git a/clang/test/CodeGen/AArch64/sign-return-address.c b/clang/test/CodeGen/AArch64/sign-return-address.c
index 11dd683a3d7e9..2b505de339054 100644
--- a/clang/test/CodeGen/AArch64/sign-return-address.c
+++ b/clang/test/CodeGen/AArch64/sign-return-address.c
@@ -28,17 +28,17 @@
 // NONE-NOT: !"branch-target-enforcement"
 // ALL-NOT: !"branch-target-enforcement"
 // PART-NOT: !"branch-target-enforcement"
-// BTE: !{i32 8, !"branch-target-enforcement", i32 1}
+// BTE: !{i32 8, !"branch-target-enforcement", i32 2}
 // B-KEY-NOT: !"branch-target-enforcement"
 // NONE-NOT: !"sign-return-address"
-// ALL: !{i32 8, !"sign-return-address", i32 1}
-// PART: !{i32 8, !"sign-return-address", i32 1}
+// ALL: !{i32 8, !"sign-return-address", i32 2}
+// PART: !{i32 8, !"sign-return-address", i32 2}
 // BTE-NOT: !"sign-return-address"
-// B-KEY: !{i32 8, !"sign-return-address", i32 1}
+// B-KEY: !{i32 8, !"sign-return-address", i32 2}
 // NONE-NOT: !"sign-return-address-all"
-// ALL: !{i32 8, !"sign-return-address-all", i32 1}
+// ALL: !{i32 8, !"sign-return-address-all", i32 2}
 // PART-NOT: !"sign-return-address-all"
 // BTE-NOT: !"sign-return-address-all"
 // B-KEY-NOT: !"sign-return-address-all"
@@ -47,6 +47,6 @@
 // ALL-NOT: !"sign-return-address-with-bkey"
 // PART-NOT: !"sign-return-address-with-bkey"
 // BTE-NOT: !"sign-return-address-with-bkey"
-// B-KEY: !{i32 8, !"sign-return-address-with-bkey", i32 1}
+// B-KEY: !{i32 8, !"sign-return-address-with-bkey", i32 2}
 void foo() {}
diff --git a/clang/test/CodeGen/arm-branch-protection-attr-2.c b/clang/test/CodeGen/arm-branch-protection-attr-2.c
index fad5dc0707fb9..53915370a71cd 100644
--- a/clang/test/CodeGen/arm-branch-protection-attr-2.c
+++ b/clang/test/CodeGen/arm-branch-protection-attr-2.c
@@ -23,16 +23,16 @@
 // NONE-NOT: !"branch-target-enforcement"
 // PART-NOT: !"branch-target-enforcement"
 // ALL-NOT: !"branch-target-enforcement"
-// BTE: !{i32 8, !"branch-target-enforcement", i32 1}
+// BTE: !{i32 8, !"branch-target-enforcement", i32 2}
 // NONE-NOT: !"sign-return-address"
-// 
PART: !{i32 8, !"sign-return-address", i32 1} -// ALL: !{i32 8, !"sign-return-address", i32 1} +// PART: !{i32 8, !"sign-return-address", i32 2} +// ALL: !{i32 8, !"sign-return-address", i32 2} // BTE-NOT: !"sign-return-address" // NONE-NOT: !"sign-return-address-all", i32 0} // PART-NOT: !"sign-return-address-all", i32 0} -// ALL: !{i32 8, !"sign-return-address-all", i32 1} +// ALL: !{i32 8, !"sign-return-address-all", i32 2} // BTE-NOT: !"sign-return-address-all", i32 0} void foo() {} diff --git a/clang/test/Frontend/arm-ignore-branch-protection-option.c b/clang/test/Frontend/arm-ignore-branch-protection-option.c index 99a2accef3ae2..45bdb37f5ed1a 100644 --- a/clang/test/Frontend/arm-ignore-branch-protection-option.c +++ b/clang/test/Frontend/arm-ignore-branch-protection-option.c @@ -15,4 +15,4 @@ __attribute__((target("arch=cortex-m0"))) void f() {} // CHECK-NOT: attributes { {{.*}} "branch-target-enforcement" /// Check that there are branch protection module attributes despite the warning. -// CHECK: !{i32 8, !"branch-target-enforcement", i32 1} +// CHECK: !{i32 8, !"branch-target-enforcement", i32 2} diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index 31096e86c5180..540d60afc5a01 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -96,6 +96,16 @@ namespace llvm { /// info. Return true if module is modified. LLVM_ABI bool UpgradeDebugInfo(Module &M); + /// Copies module attributes to the functions in the module. + /// Currently only effects ARM, Thumb and AArch64 targets. + /// Supported attributes: + /// - branch-target-enforcement + /// - branch-protection-pauth-lr + /// - guarded-control-stack + /// - sign-return-address + /// - sign-return-address-with-bkey + void copyModuleAttrToFunctions(Module &M); + /// Check whether a string looks like an old loop attachment tag. inline bool mayBeOldLoopAttachmentTag(StringRef Name) { return Name.starts_with("llvm.vectorizer."); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index cf6328580fd21..f71a534ef9c2e 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -451,6 +451,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { UpgradeModuleFlags(*M); UpgradeNVVMAnnotations(*M); UpgradeSectionAttributes(*M); + copyModuleAttrToFunctions(*M); if (!Slots) return false; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index aaee1f0a7687c..cf7efbfab65c7 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7143,6 +7143,8 @@ Error BitcodeReader::materializeModule() { UpgradeARCRuntime(*TheModule); + copyModuleAttrToFunctions(*TheModule); + return Error::success(); } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 10f915d15de95..7e5e7b524d85c 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -6045,6 +6045,120 @@ void llvm::UpgradeFunctionAttributes(Function &F) { } } +// Check if the function attribute is not present and set it. +static void setFunctionAttrIfNotSet(Function &F, StringRef FnAttrName, + StringRef Value) { + if (!F.hasFnAttribute(FnAttrName)) + F.addFnAttr(FnAttrName, Value); +} + +// Check if the function attribute is not present and set it if needed. +// If the attribute is "false" then removes it. +// If the attribute is "true" resets it to a valueless attribute. 
+static void ConvertFunctionAttr(Function &F, bool Set, StringRef FnAttrName) { + if (!F.hasFnAttribute(FnAttrName)) { + if (Set) + F.addFnAttr(FnAttrName); + } else { + auto A = F.getFnAttribute(FnAttrName); + if ("false" == A.getValueAsString()) + F.removeFnAttr(FnAttrName); + else if ("true" == A.getValueAsString()) { + F.removeFnAttr(FnAttrName); + F.addFnAttr(FnAttrName); + } + } +} + +void llvm::copyModuleAttrToFunctions(Module &M) { + Triple T(M.getTargetTriple()); + if (!T.isThumb() && !T.isARM() && !T.isAArch64()) + return; + + uint64_t BTEValue = 0; + uint64_t BPPLRValue = 0; + uint64_t GCSValue = 0; + uint64_t SRAValue = 0; + uint64_t SRAALLValue = 0; + uint64_t SRABKeyValue = 0; + + NamedMDNode *ModFlags = M.getModuleFlagsMetadata(); + if (ModFlags) { + for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) { + MDNode *Op = ModFlags->getOperand(I); + if (Op->getNumOperands() != 3) + continue; + + MDString *ID = dyn_cast_or_null(Op->getOperand(1)); + auto *CI = mdconst::dyn_extract(Op->getOperand(2)); + if (!ID || !CI) + continue; + + StringRef IDStr = ID->getString(); + uint64_t *ValPtr = IDStr == "branch-target-enforcement" ? &BTEValue + : IDStr == "branch-protection-pauth-lr" ? &BPPLRValue + : IDStr == "guarded-control-stack" ? &GCSValue + : IDStr == "sign-return-address" ? &SRAValue + : IDStr == "sign-return-address-all" ? &SRAALLValue + : IDStr == "sign-return-address-with-bkey" + ? &SRABKeyValue + : nullptr; + if (!ValPtr) + continue; + + *ValPtr = CI->getZExtValue(); + if (*ValPtr == 2) + return; + } + } + + bool BTE = BTEValue == 1; + bool BPPLR = BPPLRValue == 1; + bool GCS = GCSValue == 1; + bool SRA = SRAValue == 1; + + StringRef SignTypeValue = "non-leaf"; + if (SRA && SRAALLValue == 1) + SignTypeValue = "all"; + + StringRef SignKeyValue = "a_key"; + if (SRA && SRABKeyValue == 1) + SignKeyValue = "b_key"; + + for (Function &F : M.getFunctionList()) { + if (F.isDeclaration()) + continue; + + if (SRA) { + setFunctionAttrIfNotSet(F, "sign-return-address", SignTypeValue); + setFunctionAttrIfNotSet(F, "sign-return-address-key", SignKeyValue); + } else { + if (auto A = F.getFnAttribute("sign-return-address"); + A.isValid() && "none" == A.getValueAsString()) { + F.removeFnAttr("sign-return-address"); + F.removeFnAttr("sign-return-address-key"); + } + } + ConvertFunctionAttr(F, BTE, "branch-target-enforcement"); + ConvertFunctionAttr(F, BPPLR, "branch-protection-pauth-lr"); + ConvertFunctionAttr(F, GCS, "guarded-control-stack"); + } + + if (BTE) + M.setModuleFlag(llvm::Module::Min, "branch-target-enforcement", 2); + if (BPPLR) + M.setModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", 2); + if (GCS) + M.setModuleFlag(llvm::Module::Min, "guarded-control-stack", 2); + if (SRA) { + M.setModuleFlag(llvm::Module::Min, "sign-return-address", 2); + if (SRAALLValue == 1) + M.setModuleFlag(llvm::Module::Min, "sign-return-address-all", 2); + if (SRABKeyValue == 1) + M.setModuleFlag(llvm::Module::Min, "sign-return-address-with-bkey", 2); + } +} + static bool isOldLoopArgument(Metadata *MD) { auto *T = dyn_cast_or_null(MD); if (!T) diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 1bff6cd25156d..f78d9b016d8c9 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1512,6 +1512,11 @@ Error IRLinker::run() { // Loop over all of the linked values to compute type mappings. 
computeTypeMapping();

+  // Convert module level attributes to function level attributes because
+  // after merging modules the attributes might change and would have a
+  // different effect on the functions than they had in the original module.
+  copyModuleAttrToFunctions(*SrcM);
+
   std::reverse(Worklist.begin(), Worklist.end());
   while (!Worklist.empty()) {
     GlobalValue *GV = Worklist.back();
@@ -1677,6 +1682,11 @@ IRMover::IRMover(Module &M) : Composite(M) {
   for (const auto *MD : StructTypes.getVisitedMetadata()) {
     SharedMDs[MD].reset(const_cast<MDNode *>(MD));
   }
+
+  // Convert module level attributes to function level attributes because
+  // after merging modules the attributes might change and would have a
+  // different effect on the functions than they had in the original module.
+  copyModuleAttrToFunctions(M);
 }
 
 Error IRMover::move(std::unique_ptr<Module> Src,
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 1f773e2a7e0fc..3368a509661ae 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -820,7 +820,7 @@ void ARMAsmPrinter::emitAttributes() {
     auto *BTIValue = mdconst::extract_or_null<ConstantInt>(
         SourceModule->getModuleFlag("branch-target-enforcement"));
 
-    if (BTIValue && BTIValue->isOne()) {
+    if (BTIValue && !BTIValue->isZero()) {
       // If "+pacbti" is used as an architecture extension,
       // Tag_BTI_extension is emitted in
       // ARMTargetStreamer::emitTargetAttributes().
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 46fb567593add..aa1346d9ee56a 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1271,7 +1271,7 @@ bool LowerTypeTestsModule::hasBranchTargetEnforcement() {
     // the module flags.
     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
             M.getModuleFlag("branch-target-enforcement")))
-      HasBranchTargetEnforcement = (BTE->getZExtValue() != 0);
+      HasBranchTargetEnforcement = !BTE->isZero();
     else
       HasBranchTargetEnforcement = 0;
   }
diff --git a/llvm/test/Bitcode/upgrade-branch-protection.ll b/llvm/test/Bitcode/upgrade-branch-protection.ll
index 1b33e3901198e..6f60ba543e6c0 100644
--- a/llvm/test/Bitcode/upgrade-branch-protection.ll
+++ b/llvm/test/Bitcode/upgrade-branch-protection.ll
@@ -1,8 +1,11 @@
-;; Test that module flags "branch-target-enforcement" and "sign-return-address" can be upgraded to
-;; are upgraded from Error to Min.
+;; Test that the module flags "branch-target-enforcement" and "sign-return-address"
+;; are upgraded from Error to Min and that their value is changed to 2 as the
+;; module is converted to the new semantics.
; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s +target triple = "aarch64-unknown-linux-gnu" + !llvm.module.flags = !{!0, !1, !2, !3} !0 = !{i32 1, !"branch-target-enforcement", i32 1} @@ -10,7 +13,7 @@ !2 = !{i32 1, !"sign-return-address-all", i32 1} !3 = !{i32 1, !"sign-return-address-with-bkey", i32 1} -;CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 1} -;CHECK: !1 = !{i32 8, !"sign-return-address", i32 1} -;CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 1} -;CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 1} \ No newline at end of file +;CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 2} +;CHECK: !1 = !{i32 8, !"sign-return-address", i32 2} +;CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 2} +;CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll index 053d6a1d01800..d741411fe6cdd 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll @@ -94,5 +94,5 @@ attributes #1 = { minsize nofree norecurse nounwind optsize } !llvm.module.flags = !{!0, !1, !2} !0 = !{i32 8, !"branch-target-enforcement", i32 0} -!1 = !{i32 8, !"sign-return-address", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 2} !2 = !{i32 8, !"sign-return-address-all", i32 0} diff --git a/llvm/test/LTO/AArch64/Inputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll deleted file mode 100644 index 961b0d4e7997e..0000000000000 --- a/llvm/test/LTO/AArch64/Inputs/foo.ll +++ /dev/null @@ -1,16 +0,0 @@ -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gnu" - -define dso_local i32 @foo() #0 { -entry: - ret i32 42 -} - -attributes #0 = { noinline nounwind optnone uwtable } - -!llvm.module.flags = !{!0, !1, !2, !3} - -!0 = !{i32 8, !"branch-target-enforcement", i32 1} -!1 = !{i32 8, !"sign-return-address", i32 1} -!2 = !{i32 8, !"sign-return-address-all", i32 1} -!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1} diff --git a/llvm/test/LTO/AArch64/TestInputs/bar.ll b/llvm/test/LTO/AArch64/TestInputs/bar.ll new file mode 100644 index 0000000000000..7c2a753bcc403 --- /dev/null +++ b/llvm/test/LTO/AArch64/TestInputs/bar.ll @@ -0,0 +1,35 @@ +;; This file contains the new semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. 
+ +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local void @bar() #0 { +entry: + ret void +} +; CHECK-LABEL: bar: +; CHECK-NOT: hint +; CHECK-NOT: bti +; CHECK: ret + +define dso_local void @baz() #1 { +entry: + ret void +} + +; CHECK-LABEL: baz: +; CHECK: bti c +; CHECK: ret + +attributes #0 = { noinline nounwind optnone uwtable } +attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 8, !"branch-target-enforcement", i32 2} +!1 = !{i32 8, !"sign-return-address", i32 2} +!2 = !{i32 8, !"sign-return-address-all", i32 2} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} diff --git a/llvm/test/LTO/AArch64/TestInputs/fiz.ll b/llvm/test/LTO/AArch64/TestInputs/fiz.ll new file mode 100644 index 0000000000000..e57842681d918 --- /dev/null +++ b/llvm/test/LTO/AArch64/TestInputs/fiz.ll @@ -0,0 +1,41 @@ +;; This file contains the previous semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. + +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare void @func() + +define i32 @fiz_on() #0 { +entry: + call void @func() + ret i32 42 +} + +; CHECK-LABEL: fiz_on: +; CHECK: paciasp +; CHECK: bl func +; CHECK: retaa + +define i32 @fiz_off() #1 { +entry: + ret i32 43 +} + +; CHECK-LABEL: fiz_off: +; CHECK-NOT: pac +; CHECK-NOT: hint +; CHECK-NOT: bti +; CHECK: ret + +attributes #0 = { noinline nounwind optnone uwtable } +attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement"="false" "sign-return-address"="none" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 8, !"branch-target-enforcement", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 1} +!2 = !{i32 8, !"sign-return-address-all", i32 0} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} diff --git a/llvm/test/LTO/AArch64/TestInputs/foo.ll b/llvm/test/LTO/AArch64/TestInputs/foo.ll new file mode 100644 index 0000000000000..689d93849939d --- /dev/null +++ b/llvm/test/LTO/AArch64/TestInputs/foo.ll @@ -0,0 +1,38 @@ +;; This file contains the previous semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. 
+ +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define i32 @foo_on() #0 { +entry: + ret i32 42 +} + +; CHECK-LABEL: foo_on: +; CHECK: pacibsp +; CHECK: mov +; CHECK: retab + +define i32 @foo_off() #1 { +entry: + ret i32 43 +} + +; CHECK-LABEL: foo_off: +; CHECK-NOT: pac +; CHECK-NOT: hint +; CHECK-NOT: bti +; CHECK: ret + +attributes #0 = { noinline nounwind optnone uwtable } +attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement"="false" "sign-return-address"="none" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 8, !"branch-target-enforcement", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 1} +!2 = !{i32 8, !"sign-return-address-all", i32 1} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1} diff --git a/llvm/test/LTO/AArch64/TestInputs/old.ll b/llvm/test/LTO/AArch64/TestInputs/old.ll new file mode 100644 index 0000000000000..2b1758b0107a1 --- /dev/null +++ b/llvm/test/LTO/AArch64/TestInputs/old.ll @@ -0,0 +1,59 @@ +;; This file contains the previous semantic of the branch-target-enforcement, sign-return-address. +;; Used for test mixing a mixed link case and also verify the import too in llc. + +; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define i32 @old_bti() #0 { +entry: + ret i32 2 +} + +; CHECK-LABEL: old_bti: +; CHECK: bti c +; CHECK: mov +; CHECK: ret + +define i32 @old_pac() #1 { +entry: + ret i32 2 +} + +; CHECK-LABEL: old_pac: +; CHECK: paciasp +; CHECK: mov +; CHECK: retaa + + +define i32 @old_none() #2 { +entry: + ret i32 3 +} + +; CHECK-LABEL: old_none: +; CHECK-NOT: hint +; CHECK-NOT: paci +; CHECK-NOT: bti +; CHECK: ret + +declare i32 @func(i32) + +define i32 @old_none_leaf() #3 { +entry: + %0 = call i32 @func() + ret i32 %0 +} + +; CHECK-LABEL: old_none_leaf: +; CHECK: paciasp +; CHECK: bl func +; CHECK: retaa + +attributes #0 = { noinline nounwind optnone "branch-target-enforcement"="true" } +attributes #1 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="all" "sign-return-address-key"="a_key" } +attributes #2 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="none" } +attributes #3 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" } + +;; Intentionally no module flags diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index b3c9828cb645a..aef8907c03411 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -1,10 +1,10 @@ -; Testcase to check that module with different branch-target-enforcement can -; be mixed. -; +;; Testcase to check that module with different branch-target-enforcement can +;; be mixed. 
+;; ; RUN: llvm-as %s -o %t1.bc -; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc +; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc ; RUN: llvm-lto -exported-symbol main \ -; RUN: -exported-symbol foo \ +; RUN: -exported-symbol foo_on \ ; RUN: -filetype=obj \ ; RUN: %t1.bc %t2.bc \ ; RUN: -o %t1.exe 2>&1 | FileCheck --allow-empty %s @@ -14,11 +14,11 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" -declare i32 @foo(); +declare i32 @foo_on(); define i32 @main() "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" { entry: - %add = call i32 @foo() + %add = call i32 @foo_on() ret i32 %add } @@ -30,9 +30,12 @@ entry: ; CHECK-NOT: linking module flags 'branch-target-enforcement': IDs have conflicting values in ; CHECK-DUMP:
: +; CHECK-DUMP: paciasp +; CHECK-DUMP: str ; CHECK-DUMP: bl 0x8 -; CHECK-DUMP: : +; CHECK-DUMP: : +; CHECK-DUMP: pacibsp -; `main` doesn't support BTI while `foo` does, so in the binary -; we should see only PAC which is supported by both. +;; `main` doesn't support BTI while `foo` does, so in the binary +;; we should see only PAC which is supported by both. ; CHECK-PROP: Properties: aarch64 feature: PAC \ No newline at end of file diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll new file mode 100644 index 0000000000000..df6276f32b6f1 --- /dev/null +++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll @@ -0,0 +1,127 @@ +;; Testcase to check that module with different sign return address can +;; be mixed. +; +; RUN: llvm-as %s -o %t1.bc +; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc +; RUN: llvm-as %p/TestInputs/fiz.ll -o %t3.bc +; RUN: llvm-as %p/TestInputs/bar.ll -o %t4.bc +; RUN: llvm-as %p/TestInputs/old.ll -o %t5.bc +; RUN: llvm-lto -exported-symbol main \ +; RUN: -exported-symbol foo_on \ +; RUN: -exported-symbol foo_off \ +; RUN: -exported-symbol fiz_on \ +; RUN: -exported-symbol fiz_off \ +; RUN: -exported-symbol bar \ +; RUN: -exported-symbol baz \ +; RUN: -exported-symbol old_bti \ +; RUN: -exported-symbol old_pac \ +; RUN: -exported-symbol old_none \ +; RUN: -filetype=obj \ +; RUN: %t5.bc %t4.bc %t3.bc %t2.bc %t1.bc \ +; RUN: -o %t1.exe 2>&1 +; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s +; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @foo_on(); +declare i32 @foo_off(); +declare i32 @fiz_on(); +declare i32 @fiz_off(); +declare void @baz(); +declare void @bar(); +declare i32 @old_bti(); +declare i32 @old_pac(); +declare i32 @old_none(); + +define i32 @main() #0 { +entry: + call i32 @foo_on() + call i32 @foo_off() + call i32 @fiz_on() + call i32 @fiz_off() + call void @bar() + call void @baz() + call i32 @old_bti() + call i32 @old_pac() + call i32 @old_none() + ret i32 0 +} + +attributes #0 = { noinline nounwind optnone } + +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 0} +!1 = !{i32 8, !"sign-return-address", i32 0} +!2 = !{i32 8, !"sign-return-address-all", i32 0} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} + + +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: bti c +; CHECK-DUMP-NEXT: mov w0, #0x2 +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: paciasp +; CHECK-DUMP-NEXT: mov w0, #0x2 +; CHECK-DUMP-NEXT: autiasp +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: mov w0, #0x3 +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: bti c +; CHECK-DUMP-NEXT: ret + +;; fiz.ll represents a module with the old style of the function attributes. +;; fiz_on shall have PAC with A-key as it requested at module level. +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: paciasp +; CHECK-DUMP-NEXT: str x30, [sp, #-0x10]! +; CHECK-DUMP-NEXT: bl 0x38 +; CHECK-DUMP-NEXT: mov w0, #0x2a +; CHECK-DUMP-NEXT: ldr x30, [sp], #0x10 +; CHECK-DUMP-NEXT: autiasp +; CHECK-DUMP-NEXT: ret + +;; fiz_off shall not have BTI or PAC instructions as they are disabled at function scope. 
+; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: mov w0, #0x2b +; CHECK-DUMP-NEXT: ret + +;; foo.ll represents a module with the old style of the function attributes. +;; foo_on shall have PAC with B-key as it requested at module level. +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: pacibsp +; CHECK-DUMP-NEXT: mov w0, #0x2a +; CHECK-DUMP-NEXT: autibsp +; CHECK-DUMP-NEXT: ret + +;; foo_off shall not have BTI or PAC instructions as they are disabled at function scope. +; CHECK-DUMP-LABEL: : +; CHECK-DUMP-NEXT: mov w0, #0x2b +; CHECK-DUMP-NEXT: ret + +; CHECK-DUMP-LABEL:
: +; CHECK-DUMP-NOT: paciasp +; CHECK-DUMP-NEXT: str x30, +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl +; CHECK-DUMP-NEXT: bl + +;; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary +;; we should not see anything. +; CHECK-PROP-NOT: Properties: aarch64 feature: PAC diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll index a90f2128e4430..b5984bf557947 100644 --- a/llvm/test/Linker/link-arm-and-thumb.ll +++ b/llvm/test/Linker/link-arm-and-thumb.ll @@ -13,11 +13,11 @@ entry: ret i32 %add } -; CHECK: define i32 @main() { +; CHECK: define i32 @main() ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]] ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]] -; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" } -; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" } +; CHECK: attributes [[ARM_ATTRS]] = {{{.*}}"target-features"="-thumb-mode" } +; CHECK: attributes [[THUMB_ATTRS]] = {{{.*}}"target-features"="+thumb-mode" } ; STDERR-NOT: warning: Linking two modules of different target triples: diff --git a/llvm/test/ThinLTO/AArch64/aarch64_inline.ll b/llvm/test/ThinLTO/AArch64/aarch64_inline.ll new file mode 100644 index 0000000000000..401f66d48c224 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/aarch64_inline.ll @@ -0,0 +1,86 @@ +;; Test verifies inlining happens cross module when module flags are upgraded. +;; `foo` and `main` are both old semantic while bar is the new semantic. +;; Regression test for #82763 + +; RUN: split-file %s %t +; RUN: opt -module-summary %t/foo.ll -o %t/foo.o +; RUN: opt -module-summary %t/bar.ll -o %t/bar.o +; RUN: opt -module-summary %t/main.ll -o %t/main.o +; RUN: llvm-lto2 run %t/main.o %t/foo.o %t/bar.o -save-temps \ +; RUN: -o %t/t.exe \ +; RUN: -r=%t/foo.o,foo,plx \ +; RUN: -r=%t/bar.o,bar,plx \ +; RUN: -r=%t/main.o,foo,l \ +; RUN: -r=%t/main.o,bar,l \ +; RUN: -r=%t/main.o,main,plx 2>&1 +; RUN: llvm-dis %t/t.exe.1.4.opt.bc -o - | FileCheck %s + +; CHECK: define dso_local noundef i32 @main() local_unnamed_addr #0 { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 35 +; CHECK-NEXT: } + +; CHECK: attributes #0 = { {{.*}}"branch-target-enforcement" "sign-return-address"="all" "sign-return-address-key"="b_key" } + +; CHECK: !llvm.module.flags = !{!0, !1, !2, !3} + +; CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 2} +; CHECK: !1 = !{i32 8, !"sign-return-address", i32 2} +; CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 2} +; CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} + + +;--- foo.ll +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local noundef i32 @foo() local_unnamed_addr #0 { +entry: + ret i32 34 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 1} +!2 = !{i32 8, !"sign-return-address-all", i32 1} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1} + +;--- bar.ll +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local noundef i32 @bar() local_unnamed_addr #0 { +entry: + ret i32 1 +} + +attributes #0 = { mustprogress nofree 
norecurse nosync nounwind willreturn memory(none) "branch-target-enforcement" "sign-return-address"="all" "sign-return-address-key"="b_key" } +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 2} +!1 = !{i32 8, !"sign-return-address", i32 2} +!2 = !{i32 8, !"sign-return-address-all", i32 2} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 2} + +;--- main.ll +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @foo(); +declare i32 @bar(); + +define i32 @main() #0 { +entry: + %1 = call i32 @foo() + %2 = call i32 @bar() + %3 = add i32 %1, %2 + ret i32 %3 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } + +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 1} +!1 = !{i32 8, !"sign-return-address", i32 1} +!2 = !{i32 8, !"sign-return-address-all", i32 1} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1} From dec57c87f61bdb43299512005bfe7ea2b77bd0fe Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Wed, 22 Oct 2025 13:03:20 +0530 Subject: [PATCH 060/530] [NFC][TableGen] `emitGetOperandIdxName`: Pass arg by const& (#164563) Take `OperandNameToID` by const& to avoid copying --- llvm/utils/TableGen/InstrInfoEmitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index d1b14fbbdcd3e..0b90f9104208a 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -285,7 +285,7 @@ emitGetNamedOperandIdx(raw_ostream &OS, static void emitGetOperandIdxName(raw_ostream &OS, - MapVector OperandNameToID, + const MapVector &OperandNameToID, const MapVector, unsigned> &OperandMap, unsigned MaxNumOperands, unsigned NumOperandNames) { OS << "LLVM_READONLY OpName getOperandIdxName(uint16_t Opcode, int16_t Idx) " From 3656f6f22652b01eb13fc2f39afcf36396d64c4f Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 15:40:31 +0800 Subject: [PATCH 061/530] [CodeGen] Remove `-enable-unsafe-fp-math` option (#164559) Hope this can unblock #105746. 
--- llvm/lib/CodeGen/CommandFlags.cpp | 6 ------ llvm/test/CodeGen/AArch64/arm64-fma-combines.ll | 2 +- llvm/test/CodeGen/AArch64/csel-zero-float.ll | 2 +- llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll | 2 +- .../AMDGPU/GlobalISel/combine-fma-unmerge-values.mir | 2 +- .../AMDGPU/GlobalISel/inst-select-fract.f64.mir | 4 ++-- llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 10 +++++----- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/fract.f64.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/fract.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/inline-attr.ll | 2 +- llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll | 2 +- llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 8 ++++---- llvm/test/CodeGen/Hexagon/swp-phi.ll | 2 +- llvm/test/CodeGen/NVPTX/fma-assoc.ll | 4 ++-- llvm/test/CodeGen/PowerPC/fmf-propagation.ll | 4 ++-- llvm/test/CodeGen/PowerPC/scalar-equal.ll | 4 ++-- llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll | 2 +- llvm/test/CodeGen/PowerPC/scalar_cmp.ll | 4 ++-- llvm/test/CodeGen/SystemZ/fp-sincos-01.ll | 2 +- llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll | 2 +- .../test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll | 2 +- llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll | 2 +- llvm/test/CodeGen/X86/avx-minmax.ll | 2 +- llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll | 2 +- .../CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll | 2 +- .../X86/avx512fp16-combine-xor-vfmulc-fadd.ll | 2 +- .../CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll | 2 +- llvm/test/CodeGen/X86/dag-fmf-cse.ll | 2 +- llvm/test/CodeGen/X86/fabs.ll | 2 +- llvm/test/CodeGen/X86/fp-undef.ll | 2 +- llvm/test/CodeGen/X86/fsxor-alignment.ll | 2 +- llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll | 2 +- llvm/test/CodeGen/X86/neg_fp.ll | 2 +- llvm/test/CodeGen/X86/negate-add-zero.ll | 2 +- llvm/test/CodeGen/X86/recip-pic.ll | 2 +- llvm/test/CodeGen/X86/sincos-opt.ll | 6 +++--- llvm/test/CodeGen/X86/sincos.ll | 2 +- llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 12 ++++++------ .../Transforms/SimplifyCFG/AArch64/prefer-fma.ll | 2 +- .../Transforms/SimplifyCFG/PowerPC/prefer-fma.ll | 2 +- 45 files changed, 80 insertions(+), 86 deletions(-) diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 7f19161c8d138..c1365f499dcf5 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -218,12 +218,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { "Enable frame pointer elimination"))); CGBINDOPT(FramePointerUsage); - [[maybe_unused]] - static cl::opt EnableUnsafeFPMath( - "enable-unsafe-fp-math", - cl::desc("Enable optimizations that may decrease FP precision"), - cl::init(false)); - static cl::opt EnableNoInfsFPMath( "enable-no-infs-fp-math", cl::desc("Enable FP math optimizations that assume no +-Infs"), diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll index e17a0a96955b1..54f752efb2b3d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -enable-unsafe-fp-math -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -verify-machineinstrs | FileCheck %s define void 
@foo_2d(ptr %src) { ; CHECK-LABEL: %entry diff --git a/llvm/test/CodeGen/AArch64/csel-zero-float.ll b/llvm/test/CodeGen/AArch64/csel-zero-float.ll index 6edde13f0a7c7..56a33cc4936be 100644 --- a/llvm/test/CodeGen/AArch64/csel-zero-float.ll +++ b/llvm/test/CodeGen/AArch64/csel-zero-float.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -enable-unsafe-fp-math < %s +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s ; There is no invocation to FileCheck as this ; caused a crash in "Post-RA pseudo instruction expansion" diff --git a/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll b/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll index f78fcea9e3a1f..b8dcd6f120957 100644 --- a/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple aarch64-none-linux-gnu -enable-unsafe-fp-math -mattr=+fullfp16 < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+fullfp16 < %s | FileCheck %s define half @scvtf_f16_2(i32 %state) { ; CHECK-LABEL: scvtf_f16_2: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir index d9ac9a71ebb25..de1bb477a3e55 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s +# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s # Test that we fold correct element from G_UNMERGE_VALUES into fma diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir index 52b1beb0b0594..91f2f6f15b931 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11 --- name: fract_f64_neg diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index 13206adc10ffa..f45070cbe88ee 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; 
RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s -; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s +; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s +; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s ; FIXME: This should also fold when fma is actually fast if an FMA ; exists in the original program. diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index d41e2c62be9a6..8df756481e54a 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s +; RUN: llc 
-amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index a43292d9e5021..a043d537fbc45 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fptosi_f16_to_i16( diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index 96cb62110001e..af1ab37e48474 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck 
-check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fptoui_f16_to_i16( diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 5d311776066e5..f11654912d02d 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -2,14 +2,14 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll index f09c1c6b0e5c7..cc2e78d431bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll @@ -2,8 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s -; RUN: llc 
-amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s declare double @llvm.fabs.f64(double) #0 declare double @llvm.floor.f64(double) #0 diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll index 8ef0fcfc1e404..723fd93411c36 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.ll @@ -1,8 +1,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.floor.f32(float) #0 diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index 4ae0ba065adfc..4e93ecac71622 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=GCN,UNSAFE %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 %s | FileCheck --check-prefixes=GCN,UNSAFE %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-nans-fp-math %s | FileCheck --check-prefixes=GCN,NONANS %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck --check-prefixes=GCN,NOINFS %s diff --git a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll index ef3e04c0e9968..6ce614bd92480 100644 --- a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes='simplifycfg' -mtriple=amdgcn-- --fp-contract=fast -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=FP-CONTRACT-FAST %s -; RUN: opt -S -passes='simplifycfg' -mtriple=amdgcn-- --fp-contract=off --enable-unsafe-fp-math -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s +; RUN: opt -S -passes='simplifycfg' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s ; RUN: opt -S -passes='simplifycfg' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=NO-UNSAFE-FP-MATH %s define double 
@is_profitable_f64_contract(ptr dereferenceable(8) %ptr_x, ptr dereferenceable(8) %ptr_y, ptr dereferenceable(8) %ptr_a) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 09596e956d603..7ddd90e7aeb53 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 9bcba6c9a3a77..2d7ce10644928 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: diff --git a/llvm/test/CodeGen/Hexagon/swp-phi.ll 
b/llvm/test/CodeGen/Hexagon/swp-phi.ll index 9b0e1262c73f5..6ce2481e52115 100644 --- a/llvm/test/CodeGen/Hexagon/swp-phi.ll +++ b/llvm/test/CodeGen/Hexagon/swp-phi.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -enable-unsafe-fp-math -enable-pipeliner \ +; RUN: llc -mtriple=hexagon -enable-pipeliner \ ; RUN: -pipeliner-prune-deps=false -stats -o /dev/null < %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll index 6693c9044ca2c..db0eae75ae9ba 100644 --- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll +++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %} define ptx_device float @t1_f32(float %x, float %y, float %z, ; CHECK-UNSAFE-LABEL: t1_f32( diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll index cad684ecf0170..baa127e451701 100644 --- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll +++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll @@ -2,8 +2,8 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 | FileCheck %s --check-prefix=FMFDEBUG ; RUN: llc < %s -mtriple=powerpc64le | FileCheck %s --check-prefix=FMF -; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG -; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL +; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG +; RUN: llc < %s -mtriple=powerpc64le -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL ; Test FP transforms using instruction/node-level fast-math-flags. ; We're also checking debug output to verify that FMF is propagated to the newly created nodes. 
diff --git a/llvm/test/CodeGen/PowerPC/scalar-equal.ll b/llvm/test/CodeGen/PowerPC/scalar-equal.ll index 1832475e7795b..c0b11b47236a9 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-equal.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-equal.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ ; RUN: --check-prefix=FAST-P8 -; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ diff --git a/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll b/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll index ca9bacebe7a33..5915bd3b21085 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s diff --git a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll index fd0b494d57677..881d1f4c4093b 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ ; RUN: --check-prefix=FAST-P8 -; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ +; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ ; RUN: --enable-no-nans-fp-math --enable-no-infs-fp-math \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll index 4a38d7afba2c9..c87f113a1f499 100644 --- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll @@ -1,7 +1,7 @@ ; Test that combined sin/cos library call is emitted when appropriate ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT -; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT define float @f1(float %x) { ; CHECK-OPT-LABEL: f1: diff --git a/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll b/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll index bea11e91669a0..940fe8cf6ba75 100644 --- a/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll +++ b/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=i686-- -mattr=-sse | 
FileCheck %s -check-prefix=WITHNANS -; RUN: llc < %s -mtriple=i686-- -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS +; RUN: llc < %s -mtriple=i686-- -mattr=-sse -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS ; WITHNANS-LABEL: test: ; WITHNANS: setnp diff --git a/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll b/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll index 8411a409a522d..ff7a99ada7e50 100644 --- a/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll +++ b/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -enable-unsafe-fp-math -mtriple=i686-- | FileCheck %s +; RUN: llc < %s -mtriple=i686-- | FileCheck %s ; rdar://5902801 declare void @test2() diff --git a/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll index 6ebbb2e97d139..0e0e20f3f2ede 100644 --- a/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll +++ b/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -enable-unsafe-fp-math +; RUN: llc < %s ; target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" target triple = "i386-apple-macosx10.8.0" diff --git a/llvm/test/CodeGen/X86/avx-minmax.ll b/llvm/test/CodeGen/X86/avx-minmax.ll index 6da04c50c336a..8e4b6c6af4cb1 100644 --- a/llvm/test/CodeGen/X86/avx-minmax.ll +++ b/llvm/test/CodeGen/X86/avx-minmax.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -enable-no-nans-fp-math | FileCheck %s define <2 x double> @maxpd(<2 x double> %x, <2 x double> %y) { ; CHECK-LABEL: maxpd: diff --git a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll index f8279985f12d2..eb9de8a9536d0 100644 --- a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll +++ b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=CHECK_UNSAFE ; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -mattr=+avx512f | FileCheck %s ; RUN: llc < %s -mtriple=x86_64 -enable-no-signed-zeros-fp-math -mattr=+avx512f | FileCheck %s -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s ; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s define <16 x float> @test_max_v16f32(ptr %a_ptr, <16 x float> %b) { diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll index 5d9784aa5d2eb..1147d7945c694 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x 
half> %rhs.coerce) { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll index b58bae93ed660..1c4d9c6535761 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll index 92bdebb34979a..a8ff969c3c05f 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/dag-fmf-cse.ll b/llvm/test/CodeGen/X86/dag-fmf-cse.ll index 609ccdc367395..cdcc082e72f2b 100644 --- a/llvm/test/CodeGen/X86/dag-fmf-cse.ll +++ b/llvm/test/CodeGen/X86/dag-fmf-cse.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma | FileCheck %s ; If fast-math-flags are propagated correctly, the mul1 expression ; should be recognized as a factor in the last fsub, so we should diff --git a/llvm/test/CodeGen/X86/fabs.ll b/llvm/test/CodeGen/X86/fabs.ll index 82c82ac3e917e..4e6da83ec55f3 100644 --- a/llvm/test/CodeGen/X86/fabs.ll +++ b/llvm/test/CodeGen/X86/fabs.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=X87 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=X87UNSAFE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 -enable-no-nans-fp-math | FileCheck %s --check-prefix=X87UNSAFE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 declare float @fabsf(float) diff --git a/llvm/test/CodeGen/X86/fp-undef.ll b/llvm/test/CodeGen/X86/fp-undef.ll index 227f0073e103b..c358085722a9b 100644 --- a/llvm/test/CodeGen/X86/fp-undef.ll +++ b/llvm/test/CodeGen/X86/fp-undef.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ANY -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck %s --check-prefix=ANY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ANY ; This is duplicated from tests for 
InstSimplify. If you're ; adding something here, you should probably add it there too. diff --git a/llvm/test/CodeGen/X86/fsxor-alignment.ll b/llvm/test/CodeGen/X86/fsxor-alignment.ll index 6fa4a310956dc..32af5b969b6d5 100644 --- a/llvm/test/CodeGen/X86/fsxor-alignment.ll +++ b/llvm/test/CodeGen/X86/fsxor-alignment.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s ; Don't fold the incoming stack arguments into the xorps instructions used ; to do floating-point negations, because the arguments aren't vectors diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll index f710a300dd2d4..bd997d1647766 100644 --- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll +++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse < %s | FileCheck %s ; The debug info in this test case was causing a crash because machine trace metrics ; did not correctly ignore debug instructions. The check lines ensure that the diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll index 8020982509819..18ded50b45037 100644 --- a/llvm/test/CodeGen/X86/neg_fp.ll +++ b/llvm/test/CodeGen/X86/neg_fp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s -; Test that when we don't -enable-unsafe-fp-math, we don't do the optimization +; Test that when we don't, we don't do the optimization ; -0 - (A - B) to (B - A) because A==B, -0 != 0 define float @negfp(float %a, float %b) nounwind { diff --git a/llvm/test/CodeGen/X86/negate-add-zero.ll b/llvm/test/CodeGen/X86/negate-add-zero.ll index eb4e2d312af20..4884832a91a52 100644 --- a/llvm/test/CodeGen/X86/negate-add-zero.ll +++ b/llvm/test/CodeGen/X86/negate-add-zero.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s | FileCheck %s ; PR3374 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" diff --git a/llvm/test/CodeGen/X86/recip-pic.ll b/llvm/test/CodeGen/X86/recip-pic.ll index d01ecc1e2ce1e..d2620e7202ee7 100644 --- a/llvm/test/CodeGen/X86/recip-pic.ll +++ b/llvm/test/CodeGen/X86/recip-pic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK define fastcc float @foo(float %x) unnamed_addr #0 { ; CHECK-LABEL: foo: diff --git a/llvm/test/CodeGen/X86/sincos-opt.ll b/llvm/test/CodeGen/X86/sincos-opt.ll index 68854569febba..51f3e52f88a6d 100644 --- a/llvm/test/CodeGen/X86/sincos-opt.ll +++ b/llvm/test/CodeGen/X86/sincos-opt.ll @@ -1,10 +1,10 @@ ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s 
--check-prefix=OSX_SINCOS ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_NOOPT ; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS -; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH -; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH ; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS -; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH +; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH ; RUN: llc < %s -mtriple=x86_64-scei-ps4 -mcpu=btver2 | FileCheck %s --check-prefix=PS4_SINCOS ; RUN: llc < %s -mtriple=x86_64-sie-ps5 -mcpu=znver2 | FileCheck %s --check-prefix=PS4_SINCOS diff --git a/llvm/test/CodeGen/X86/sincos.ll b/llvm/test/CodeGen/X86/sincos.ll index 79034077f1835..9206c254949ba 100644 --- a/llvm/test/CodeGen/X86/sincos.ll +++ b/llvm/test/CodeGen/X86/sincos.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; Make sure this testcase codegens to the sin and cos instructions, not calls -; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s ; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s declare float @sinf(float) readonly diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll index c0beb6fef3234..2822d407b8d01 100644 --- a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CST --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64 | FileCheck %s --check-prefix=CST --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64 -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL ; Check that the constant used in the vectors are the right ones. 
; SSE2: [[MASKCSTADDR:.LCPI[0-9_]+]]: diff --git a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll index 0f18dc2132ec5..46e38d9abd4c8 100644 --- a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll +++ b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S >%t +; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S >%t ; RUN: FileCheck %s < %t ; ModuleID = 't.cc' diff --git a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll index c7bc43e1faaed..b61d659673945 100644 --- a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll +++ b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S | \ +; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S | \ ; RUN: FileCheck %s ; This case is copied from test/Transforms/SimplifyCFG/AArch64/ From f36f2bff843c2bdf388189f0fd39fe3512070e1d Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 22 Oct 2025 09:09:40 +0100 Subject: [PATCH 062/530] [ARM][MVE] Invalid tail predication in LowOverheadLoop pass (#163941) When a loop is converted into a low-overhead loop using tail predication via FPSCR.LTPSIZE, the MQPRCopy pseudo-instruction is expanded into either two VMOVD or a single MVE_VORR, depending on whether the values written to the lanes with a 'false' predicate matter. (MVE_VORR uses the ambient LTPSIZE predicate, so it won't write those lanes at all; the double VMOVD is slower but gets them right.) This check was done based on whether the output of the MQPRCopy is live coming out of the loop. But it missed a case where the live-out value is not _itself_ an MQPRCopy, but is a predicated operation taking its false lanes from an MQPRCopy. Fixes #162644, and adds a new MIR test case derived from the reproducer in that bug. 
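To make the rule concrete, a minimal C++ sketch of the extended live-out walk follows. It reuses the names from ValidateLiveOuts() in the patch below (MI, RDI, Worklist, VMOVCopies are assumed to be the pass's existing worklist state); it is an illustration of the decision, not the complete pass logic:

  // Live-out MQPRCopy: must be expanded as two VMOVDs so that the lanes
  // disabled by FPSCR.LTPSIZE are still copied.
  if (MI->getOpcode() == ARM::MQPRCopy) {
    VMOVCopies.insert(MI);
  // Live-out predicated (merging) instruction: whatever defines its
  // inactive-lanes operand is effectively live out too, so keep walking.
  } else if (isVectorPredicated(MI)) {
    int InactiveIdx = findVPTInactiveOperandIdx(*MI);
    if (InactiveIdx != -1)
      if (MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef(
              MI, MI->getOperand(InactiveIdx).getReg()))
        Worklist.push_back(FalseSrc); // may itself be an MQPRCopy
  }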
--- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 15 ++ llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 10 ++ llvm/lib/Target/ARM/Thumb2InstrInfo.h | 3 + .../vctp-vs-unpredicated-copy.mir | 146 ++++++++++++++++++ 4 files changed, 174 insertions(+) create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 406f4c1f21983..597d311989b2f 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1036,6 +1036,7 @@ bool LowOverheadLoop::ValidateLiveOuts() { while (!Worklist.empty()) { MachineInstr *MI = Worklist.pop_back_val(); if (MI->getOpcode() == ARM::MQPRCopy) { + LLVM_DEBUG(dbgs() << " Must generate copy as VMOV: " << *MI); VMOVCopies.insert(MI); MachineInstr *CopySrc = RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg()); @@ -1045,6 +1046,20 @@ bool LowOverheadLoop::ValidateLiveOuts() { LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); VMOVCopies.clear(); return false; + } else if (isVectorPredicated(MI)) { + // If this is a predicated instruction with merging semantics, + // check where it gets its false lanes from, if any. + int InactiveIdx = findVPTInactiveOperandIdx(*MI); + if (InactiveIdx != -1) { + SmallPtrSet Defs; + MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef( + MI, MI->getOperand(InactiveIdx).getReg()); + if (FalseSrc) { + LLVM_DEBUG(dbgs() + << " Must check source of false lanes for: " << *MI); + Worklist.push_back(FalseSrc); + } + } } } diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 431ce38ad6e99..f5653d459eac8 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -805,6 +805,16 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) { return -1; } +int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) + return i + ARM::SUBOP_vpred_r_inactive; + + return -1; +} + ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, Register &PredReg) { int PIdx = findFirstVPTPredOperandIdx(MI); diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 3ec3a6216b9f6..1b0bf2d499510 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) { Register PredReg; return getVPTInstrPredicate(MI, PredReg); } +// Identify the input operand in an MVE predicated instruction which +// contributes the values of any inactive vector lanes. +int findVPTInactiveOperandIdx(const MachineInstr &MI); // Recomputes the Block Mask of Instr, a VPT or VPST instruction. 
// This rebuilds the block mask of the instruction depending on the predicates diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir new file mode 100644 index 0000000000000..5783133132fd0 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir @@ -0,0 +1,146 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s + +# From bug #162644. The _wrong_ output of this test is to generate the +# body of the tail-predicated loop like this: +# +# $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2 +# renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4) +# $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0 +# renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0 +# $lr = MVE_LETP killed renamable $lr, %bb.1 +# +# in which the second MVE_VORR, copying q1 into q0, is an invalid conversion of +# the input MQPRCopy, because it won't copy the vector lanes disabled by +# FPSCR.LTPSIZE, and those are needed in the output value of the loop. +# +# In the right output, that MQPRCopy is expanded into a pair of VMOVD copying +# d2,d3 into d0,d1 respectively, which are unaffected by LTPSIZE. + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-none-eabihf" + + @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16 + + define <4 x float> @test_func(ptr %0, i32 %1) { + %3 = load <4 x float>, ptr @inactive, align 16 + %4 = add i32 %1, 3 + %5 = call i32 @llvm.smin.i32(i32 %1, i32 4) + %6 = sub i32 %4, %5 + %7 = lshr i32 %6, 2 + %8 = add nuw nsw i32 %7, 1 + %9 = call i32 @llvm.start.loop.iterations.i32(i32 %8) + br label %10 + + 10: ; preds = %10, %2 + %11 = phi <4 x float> [ splat (float 0x3FB99999A0000000), %2 ], [ %17, %10 ] + %12 = phi i32 [ %1, %2 ], [ %19, %10 ] + %13 = phi ptr [ %0, %2 ], [ %18, %10 ] + %14 = phi i32 [ %9, %2 ], [ %20, %10 ] + %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12) + %16 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %13, i32 4, <4 x i1> %15, <4 x float> zeroinitializer) + %17 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %11, <4 x float> %16, <4 x i1> %15, <4 x float> %3) + %18 = getelementptr inbounds nuw i8, ptr %13, i32 16 + %19 = add i32 %12, -4 + %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %10, label %22 + + 22: ; preds = %10 + ret <4 x float> %17 + } +... 
+--- +name: test_func +alignment: 4 +legalized: false +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + ; CHECK-LABEL: name: test_func + ; CHECK: bb.0 (%ir-block.2): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $lr, $r0, $r1, $r7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK-NEXT: $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive) + ; CHECK-NEXT: $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 + ; CHECK-NEXT: $lr = MVE_DLSTP_32 killed renamable $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.10, align 4): + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: liveins: $lr, $d2, $d3, $q0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2 + ; CHECK-NEXT: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4) + ; CHECK-NEXT: $d0 = VMOVD $d2, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $d1 = VMOVD $d3, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0 + ; CHECK-NEXT: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2 (%ir-block.22): + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0 + bb.0 (%ir-block.2): + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r7, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg + tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr + $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg + renamable $r3 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive) + $r2 = tMOVr $r1, 14 /* CC::al */, $noreg + t2IT 10, 8, implicit-def 
$itstate + renamable $r2 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r2, implicit killed $itstate + renamable $r2, dead $cpsr = tSUBrr renamable $r1, killed renamable $r2, 14 /* CC::al */, $noreg + renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg + $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg + $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 + renamable $lr = t2DoLoopStartTP killed renamable $r2, renamable $r1 + + bb.1 (%ir-block.10, align 4): + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $q1, $r0, $r1 + + renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg, $noreg + $q2 = MQPRCopy killed $q0 + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, renamable $lr :: (load unknown-size from %ir.13, align 4) + $q0 = MQPRCopy $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 1, killed renamable $vpr, renamable $lr, killed renamable $q0 + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2 (%ir-block.22): + liveins: $q0 + + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0 +... From dda30e1b7d100c5df2285d27af25f8b42b7aac19 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 22 Oct 2025 10:11:13 +0200 Subject: [PATCH 063/530] [flang][OpenACC] remap common block member symbols (#163752) --- flang/lib/Lower/OpenACC.cpp | 55 ++++++++++++++++--- .../acc-data-operands-remapping-common.f90 | 43 +++++++++++++++ 2 files changed, 90 insertions(+), 8 deletions(-) create mode 100644 flang/test/Lower/OpenACC/acc-data-operands-remapping-common.f90 diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index af4f4207b8e4c..1fc59c702fd81 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -2366,6 +2366,23 @@ static void processDoLoopBounds( } } +static void remapCommonBlockMember( + Fortran::lower::AbstractConverter &converter, mlir::Location loc, + const Fortran::semantics::Symbol &member, + mlir::Value newCommonBlockBaseAddress, + const Fortran::semantics::Symbol &commonBlockSymbol, + llvm::SmallPtrSetImpl &seenSymbols) { + if (seenSymbols.contains(&member)) + return; + mlir::Value accMemberValue = Fortran::lower::genCommonBlockMember( + converter, loc, member, newCommonBlockBaseAddress, + commonBlockSymbol.size()); + fir::ExtendedValue hostExv = converter.getSymbolExtendedValue(member); + fir::ExtendedValue accExv = fir::substBase(hostExv, accMemberValue); + converter.bindSymbol(member, accExv); + seenSymbols.insert(&member); +} + /// Remap symbols that appeared in OpenACC data clauses to use the results of /// the corresponding data operations. 
This allows isolating symbol accesses /// inside the OpenACC region from accesses in the host and other regions while @@ -2391,14 +2408,39 @@ static void remapDataOperandSymbols( builder.setInsertionPointToStart(®ionOp.getRegion().front()); llvm::SmallPtrSet seenSymbols; mlir::IRMapping mapper; + mlir::Location loc = regionOp.getLoc(); for (auto [value, symbol] : dataOperandSymbolPairs) { - - // If A symbol appears on several data clause, just map it to the first + // If a symbol appears on several data clause, just map it to the first // result (all data operations results for a symbol are pointing same // memory, so it does not matter which one is used). if (seenSymbols.contains(&symbol.get())) continue; seenSymbols.insert(&symbol.get()); + // When a common block appears in a directive, remap its members. + // Note: this will instantiate all common block members even if they are not + // used inside the region. If hlfir.declare DCE is not made possible, this + // could be improved to reduce IR noise. + if (const auto *commonBlock = symbol->template detailsIf< + Fortran::semantics::CommonBlockDetails>()) { + const Fortran::semantics::Scope &commonScope = symbol->owner(); + if (commonScope.equivalenceSets().empty()) { + for (auto member : commonBlock->objects()) + remapCommonBlockMember(converter, loc, *member, value, *symbol, + seenSymbols); + } else { + // Objects equivalenced with common block members still belong to the + // common block storage even if they are not part of the common block + // declaration. The easiest and most robust way to find all symbols + // belonging to the common block is to loop through the scope symbols + // and check if they belong to the common. + for (const auto &scopeSymbol : commonScope) + if (Fortran::semantics::FindCommonBlockContaining( + *scopeSymbol.second) == &symbol.get()) + remapCommonBlockMember(converter, loc, *scopeSymbol.second, value, + *symbol, seenSymbols); + } + continue; + } std::optional hostDef = symbolMap.lookupVariableDefinition(symbol); assert(hostDef.has_value() && llvm::isa(*hostDef) && @@ -2415,10 +2457,8 @@ static void remapDataOperandSymbols( "box type mismatch between compute region variable and " "hlfir.declare input unexpected"); if (Fortran::semantics::IsOptional(symbol)) - TODO(regionOp.getLoc(), - "remapping OPTIONAL symbol in OpenACC compute region"); - auto rawValue = - fir::BoxAddrOp::create(builder, regionOp.getLoc(), hostType, value); + TODO(loc, "remapping OPTIONAL symbol in OpenACC compute region"); + auto rawValue = fir::BoxAddrOp::create(builder, loc, hostType, value); mapper.map(hostInput, rawValue); } else { assert(!llvm::isa(hostType) && @@ -2430,8 +2470,7 @@ static void remapDataOperandSymbols( assert(fir::isa_ref_type(hostType) && fir::isa_ref_type(computeType) && "compute region variable and host variable should both be raw " "addresses"); - mlir::Value cast = - builder.createConvert(regionOp.getLoc(), hostType, value); + mlir::Value cast = builder.createConvert(loc, hostType, value); mapper.map(hostInput, cast); } if (mlir::Value dummyScope = hostDeclare.getDummyScope()) { diff --git a/flang/test/Lower/OpenACC/acc-data-operands-remapping-common.f90 b/flang/test/Lower/OpenACC/acc-data-operands-remapping-common.f90 new file mode 100644 index 0000000000000..1ab883e0599c0 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-data-operands-remapping-common.f90 @@ -0,0 +1,43 @@ +! Test remapping of common blocks appearing in OpenACC data directives. + +! 
RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +subroutine test + real :: x(100), y(100), overlap1(100), overlap2(100) + equivalence (x(50), overlap1) + equivalence (x(40), overlap2) + common /comm/ x, y + !$acc declare link(/comm/) + !$acc parallel loop copyin(/comm/) + do i = 1, 100 + x(i) = overlap1(i)*2+ overlap2(i) + enddo +end subroutine +! CHECK-LABEL: func.func @_QPtest() { +! CHECK: %[[ADDRESS_OF_0:.*]] = fir.address_of(@comm_) +! CHECK: %[[COPYIN_0:.*]] = acc.copyin varPtr(%[[ADDRESS_OF_0]] : !fir.ref>) -> !fir.ref> {name = "comm"} +! CHECK: acc.parallel combined(loop) dataOperands(%[[COPYIN_0]] : !fir.ref>) { +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 196 : index +! CHECK: %[[COORDINATE_OF_4:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref>, index) -> !fir.ref +! CHECK: %[[CONVERT_4:.*]] = fir.convert %[[COORDINATE_OF_4]] : (!fir.ref) -> !fir.ptr> +! CHECK: %[[SHAPE_4:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> +! CHECK: %[[DECLARE_5:.*]]:2 = hlfir.declare %[[CONVERT_4]](%[[SHAPE_4]]) storage(%[[COPYIN_0]][196]) {uniq_name = "_QFtestEoverlap1"} : (!fir.ptr>, !fir.shape<1>, !fir.ref>) -> (!fir.ptr>, !fir.ptr>) +! CHECK: %[[CONSTANT_9:.*]] = arith.constant 156 : index +! CHECK: %[[COORDINATE_OF_5:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref>, index) -> !fir.ref +! CHECK: %[[CONVERT_5:.*]] = fir.convert %[[COORDINATE_OF_5]] : (!fir.ref) -> !fir.ptr> +! CHECK: %[[SHAPE_5:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> +! CHECK: %[[DECLARE_6:.*]]:2 = hlfir.declare %[[CONVERT_5]](%[[SHAPE_5]]) storage(%[[COPYIN_0]][156]) {uniq_name = "_QFtestEoverlap2"} : (!fir.ptr>, !fir.shape<1>, !fir.ref>) -> (!fir.ptr>, !fir.ptr>) +! CHECK: %[[CONSTANT_10:.*]] = arith.constant 0 : index +! CHECK: %[[COORDINATE_OF_6:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref>, index) -> !fir.ref +! CHECK: %[[CONVERT_6:.*]] = fir.convert %[[COORDINATE_OF_6]] : (!fir.ref) -> !fir.ptr> +! CHECK: %[[SHAPE_6:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> +! CHECK: %[[DECLARE_7:.*]]:2 = hlfir.declare %[[CONVERT_6]](%[[SHAPE_6]]) storage(%[[COPYIN_0]][0]) {uniq_name = "_QFtestEx"} : (!fir.ptr>, !fir.shape<1>, !fir.ref>) -> (!fir.ptr>, !fir.ptr>) +! CHECK: %[[CONSTANT_11:.*]] = arith.constant 400 : index +! CHECK: %[[COORDINATE_OF_7:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref>, index) -> !fir.ref +! CHECK: %[[CONVERT_7:.*]] = fir.convert %[[COORDINATE_OF_7]] : (!fir.ref) -> !fir.ref> +! CHECK: %[[SHAPE_7:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> +! CHECK: %[[DECLARE_8:.*]]:2 = hlfir.declare %[[CONVERT_7]](%[[SHAPE_7]]) storage(%[[COPYIN_0]][400]) {uniq_name = "_QFtestEy"} : (!fir.ref>, !fir.shape<1>, !fir.ref>) -> (!fir.ref>, !fir.ref>) +! CHECK: acc.loop combined(parallel) +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[DECLARE_5]]#0 +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[DECLARE_6]]#0 +! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DECLARE_7]]#0 From eeffaf110e991791a8ff7e9b7bb40b5317c70bad Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 22 Oct 2025 10:27:55 +0200 Subject: [PATCH 064/530] [AllocToken] Refactor stateless token calculation into Support (#163633) Refactor the stateless (hash-based) token calculation logic out of the `AllocToken` pass and into `llvm/Support/AllocToken.h`. This helps with making the token calculation logic available to other parts of the codebase, which will be necessary for frontend implementation of `__builtin_infer_alloc_token` to perform constexpr evaluation. 
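As a rough usage sketch, a caller could query the relocated logic as below. The API names come from the new llvm/Support/AllocToken.h added in this patch; the type name and token-space size are made-up example values:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/AllocToken.h"

  uint64_t exampleToken() {
    // Allocation of a pointer-containing type named "MyStruct" (example only).
    llvm::AllocTokenMetadata MD{llvm::StringRef("MyStruct"),
                                /*ContainsPointer=*/true};
    // TypeHashPointerSplit places pointer-containing types in the top half of
    // the token ID space; stateful modes (Increment/Random) return nullopt.
    if (auto Token = llvm::getAllocToken(
            llvm::AllocTokenMode::TypeHashPointerSplit, MD,
            /*MaxTokens=*/1ULL << 32))
      return *Token;
    return 0;
  }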
The `AllocTokenMode` enum and a new `AllocTokenMetadata` struct are moved into a shared header. The `getAllocTokenHash()` function now provides the source of truth for calculating token IDs for `TypeHash` and `TypeHashPointerSplit` modes. --- llvm/include/llvm/Support/AllocToken.h | 58 +++++++++++++++++++ llvm/lib/Support/AllocToken.cpp | 50 ++++++++++++++++ llvm/lib/Support/CMakeLists.txt | 1 + .../Transforms/Instrumentation/AllocToken.cpp | 57 ++++++------------ .../gn/secondary/llvm/lib/Support/BUILD.gn | 1 + 5 files changed, 128 insertions(+), 39 deletions(-) create mode 100644 llvm/include/llvm/Support/AllocToken.h create mode 100644 llvm/lib/Support/AllocToken.cpp diff --git a/llvm/include/llvm/Support/AllocToken.h b/llvm/include/llvm/Support/AllocToken.h new file mode 100644 index 0000000000000..8d82670eaeb8d --- /dev/null +++ b/llvm/include/llvm/Support/AllocToken.h @@ -0,0 +1,58 @@ +//===- llvm/Support/AllocToken.h - Allocation Token Calculation -----*- C++ -*// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definition of AllocToken modes and shared calculation of stateless token IDs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_ALLOCTOKEN_H +#define LLVM_SUPPORT_ALLOCTOKEN_H + +#include "llvm/ADT/SmallString.h" +#include +#include + +namespace llvm { + +/// Modes for generating allocation token IDs. +enum class AllocTokenMode { + /// Incrementally increasing token ID. + Increment, + + /// Simple mode that returns a statically-assigned random token ID. + Random, + + /// Token ID based on allocated type hash. + TypeHash, + + /// Token ID based on allocated type hash, where the top half ID-space is + /// reserved for types that contain pointers and the bottom half for types + /// that do not contain pointers. + TypeHashPointerSplit, +}; + +/// Metadata about an allocation used to generate a token ID. +struct AllocTokenMetadata { + SmallString<64> TypeName; + bool ContainsPointer; +}; + +/// Calculates stable allocation token ID. Returns std::nullopt for stateful +/// modes that are only available in the AllocToken pass. +/// +/// \param Mode The token generation mode. +/// \param Metadata The metadata about the allocation. +/// \param MaxTokens The maximum number of tokens (must not be 0) +/// \return The calculated allocation token ID, or std::nullopt. +LLVM_ABI std::optional +getAllocToken(AllocTokenMode Mode, const AllocTokenMetadata &Metadata, + uint64_t MaxTokens); + +} // end namespace llvm + +#endif // LLVM_SUPPORT_ALLOCTOKEN_H diff --git a/llvm/lib/Support/AllocToken.cpp b/llvm/lib/Support/AllocToken.cpp new file mode 100644 index 0000000000000..95ecda2ffd8ba --- /dev/null +++ b/llvm/lib/Support/AllocToken.cpp @@ -0,0 +1,50 @@ +//===- AllocToken.cpp - Allocation Token Calculation ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definition of AllocToken modes and shared calculation of stateless token IDs. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/AllocToken.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SipHash.h" + +using namespace llvm; + +static uint64_t getStableHash(const AllocTokenMetadata &Metadata, + uint64_t MaxTokens) { + return getStableSipHash(Metadata.TypeName) % MaxTokens; +} + +std::optional llvm::getAllocToken(AllocTokenMode Mode, + const AllocTokenMetadata &Metadata, + uint64_t MaxTokens) { + assert(MaxTokens && "Must provide non-zero max tokens"); + + switch (Mode) { + case AllocTokenMode::Increment: + case AllocTokenMode::Random: + // Stateful modes cannot be implemented as a pure function. + return std::nullopt; + + case AllocTokenMode::TypeHash: + return getStableHash(Metadata, MaxTokens); + + case AllocTokenMode::TypeHashPointerSplit: { + if (MaxTokens == 1) + return 0; + const uint64_t HalfTokens = MaxTokens / 2; + uint64_t Hash = getStableHash(Metadata, HalfTokens); + if (Metadata.ContainsPointer) + Hash += HalfTokens; + return Hash; + } + } + + llvm_unreachable(""); +} diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 42b21b5e62029..671a5fe941cef 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -149,6 +149,7 @@ add_llvm_component_library(LLVMSupport AArch64BuildAttributes.cpp ARMAttributeParser.cpp ARMWinEH.cpp + AllocToken.cpp Allocator.cpp AutoConvert.cpp Base64.cpp diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp index 29968b834cfce..08738450e4b74 100644 --- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp +++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/Support/AllocToken.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -54,29 +55,12 @@ #include using namespace llvm; +using TokenMode = AllocTokenMode; #define DEBUG_TYPE "alloc-token" namespace { -//===--- Constants --------------------------------------------------------===// - -enum class TokenMode : unsigned { - /// Incrementally increasing token ID. - Increment = 0, - - /// Simple mode that returns a statically-assigned random token ID. - Random = 1, - - /// Token ID based on allocated type hash. - TypeHash = 2, - - /// Token ID based on allocated type hash, where the top half ID-space is - /// reserved for types that contain pointers and the bottom half for types - /// that do not contain pointers. - TypeHashPointerSplit = 3, -}; - //===--- Command-line options ---------------------------------------------===// cl::opt ClMode( @@ -217,22 +201,19 @@ class TypeHashMode : public ModeBase { using ModeBase::ModeBase; uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) { - const auto [N, H] = getHash(CB, ORE); - return N ? boundedToken(H) : H; - } -protected: - std::pair getHash(const CallBase &CB, - OptimizationRemarkEmitter &ORE) { if (MDNode *N = getAllocTokenMetadata(CB)) { MDString *S = cast(N->getOperand(0)); - return {N, getStableSipHash(S->getString())}; + AllocTokenMetadata Metadata{S->getString(), containsPointer(N)}; + if (auto Token = getAllocToken(TokenMode::TypeHash, Metadata, MaxTokens)) + return *Token; } // Fallback. 
remarkNoMetadata(CB, ORE); - return {nullptr, ClFallbackToken}; + return ClFallbackToken; } +protected: /// Remark that there was no precise type information. static void remarkNoMetadata(const CallBase &CB, OptimizationRemarkEmitter &ORE) { @@ -253,20 +234,18 @@ class TypeHashPointerSplitMode : public TypeHashMode { using TypeHashMode::TypeHashMode; uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) { - if (MaxTokens == 1) - return 0; - const uint64_t HalfTokens = MaxTokens / 2; - const auto [N, H] = getHash(CB, ORE); - if (!N) { - // Pick the fallback token (ClFallbackToken), which by default is 0, - // meaning it'll fall into the pointer-less bucket. Override by setting - // -alloc-token-fallback if that is the wrong choice. - return H; + if (MDNode *N = getAllocTokenMetadata(CB)) { + MDString *S = cast(N->getOperand(0)); + AllocTokenMetadata Metadata{S->getString(), containsPointer(N)}; + if (auto Token = getAllocToken(TokenMode::TypeHashPointerSplit, Metadata, + MaxTokens)) + return *Token; } - uint64_t Hash = H % HalfTokens; // base hash - if (containsPointer(N)) - Hash += HalfTokens; - return Hash; + // Pick the fallback token (ClFallbackToken), which by default is 0, meaning + // it'll fall into the pointer-less bucket. Override by setting + // -alloc-token-fallback if that is the wrong choice. + remarkNoMetadata(CB, ORE); + return ClFallbackToken; } }; diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index 38ba4661daacc..df9ddf91f2c49 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -45,6 +45,7 @@ static_library("Support") { "ARMAttributeParser.cpp", "ARMBuildAttributes.cpp", "ARMWinEH.cpp", + "AllocToken.cpp", "Allocator.cpp", "AutoConvert.cpp", "BalancedPartitioning.cpp", From 1f7ddb61b368c56838bb7301ddef4547d0be9682 Mon Sep 17 00:00:00 2001 From: Kaloyan Ignatov Date: Wed, 22 Oct 2025 10:48:39 +0200 Subject: [PATCH 065/530] [NFC][Offload][OMPT] Improve readability of liboffload OMPT tests (#163181) - ompt_target_data_op_t, ompt_scope_endpoint_t and ompt_target_t are now printed as strings instead of just numbers to ease debugging - some missing clang-format clauses have been added --- offload/test/ompt/callbacks.h | 67 ++++++++++++----- offload/test/ompt/omp_api.c | 10 +-- offload/test/ompt/target_memcpy.c | 12 ++-- offload/test/ompt/target_memcpy_emi.c | 22 +++--- offload/test/ompt/veccopy.c | 43 +++++------ offload/test/ompt/veccopy_data.c | 67 ++++++++--------- offload/test/ompt/veccopy_disallow_both.c | 75 +++++++++---------- offload/test/ompt/veccopy_emi.c | 83 +++++++++++----------- offload/test/ompt/veccopy_emi_map.c | 83 +++++++++++----------- offload/test/ompt/veccopy_map.c | 46 ++++++------ offload/test/ompt/veccopy_no_device_init.c | 42 +++++------ offload/test/ompt/veccopy_wrong_return.c | 43 +++++------ 12 files changed, 327 insertions(+), 266 deletions(-) diff --git a/offload/test/ompt/callbacks.h b/offload/test/ompt/callbacks.h index 95437d9cdcfb1..2e7763f0abbac 100644 --- a/offload/test/ompt/callbacks.h +++ b/offload/test/ompt/callbacks.h @@ -5,6 +5,37 @@ // Tool related code below #include +static const char *ompt_target_data_op_t_values[] = { + "", + "ompt_target_data_alloc", + "ompt_target_data_transfer_to_device", + "ompt_target_data_transfer_from_device", + "ompt_target_data_delete", + "ompt_target_data_associate", + "ompt_target_data_disassociate", + "ompt_target_data_alloc_async", + 
"ompt_target_data_transfer_to_device_async", + "ompt_target_data_transfer_from_device_async", + "ompt_target_data_delete_async"}; + +static const char *ompt_scope_endpoint_t_values[] = { + "", "ompt_scope_begin", "ompt_scope_end", "ompt_scope_beginend"}; + +static const char *ompt_target_t_values[] = {"", + "ompt_target", + "ompt_target_enter_data", + "ompt_target_exit_data", + "ompt_target_update", + "", + "", + "", + "", + "", + "ompt_target_nowait", + "ompt_target_enter_data_nowait", + "ompt_target_exit_data_nowait", + "ompt_target_update_nowait"}; + // For EMI callbacks ompt_id_t next_op_id = 0x8000000000000001; @@ -38,11 +69,11 @@ static void on_ompt_callback_target_data_op( void *src_addr, int src_device_num, void *dest_addr, int dest_device_num, size_t bytes, const void *codeptr_ra) { assert(codeptr_ra != 0 && "Unexpected null codeptr"); - printf(" Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p " + printf(" Callback DataOp: target_id=%lu host_op_id=%lu optype=%s src=%p " "src_device_num=%d " "dest=%p dest_device_num=%d bytes=%lu code=%p\n", - target_id, host_op_id, optype, src_addr, src_device_num, dest_addr, - dest_device_num, bytes, codeptr_ra); + target_id, host_op_id, ompt_target_data_op_t_values[optype], src_addr, + src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra); } static void on_ompt_callback_target(ompt_target_t kind, @@ -51,9 +82,10 @@ static void on_ompt_callback_target(ompt_target_t kind, ompt_id_t target_id, const void *codeptr_ra) { assert(codeptr_ra != 0 && "Unexpected null codeptr"); - printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d " + printf("Callback Target: target_id=%lu kind=%s endpoint=%s device_num=%d " "code=%p\n", - target_id, kind, endpoint, device_num, codeptr_ra); + target_id, ompt_target_t_values[kind], + ompt_scope_endpoint_t_values[endpoint], device_num, codeptr_ra); } static void on_ompt_callback_target_submit(ompt_id_t target_id, @@ -84,13 +116,15 @@ static void on_ompt_callback_target_data_op_emi( // target_task_data may be null, avoid dereferencing it uint64_t target_task_data_value = (target_task_data) ? 
target_task_data->value : 0; - printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p " + printf(" Callback DataOp EMI: endpoint=%s optype=%s target_task_data=%p " "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p " "src_device_num=%d " "dest=%p dest_device_num=%d bytes=%lu code=%p\n", - endpoint, optype, target_task_data, target_task_data_value, - target_data, target_data->value, host_op_id, *host_op_id, src_addr, - src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra); + ompt_scope_endpoint_t_values[endpoint], + ompt_target_data_op_t_values[optype], target_task_data, + target_task_data_value, target_data, target_data->value, host_op_id, + *host_op_id, src_addr, src_device_num, dest_addr, dest_device_num, + bytes, codeptr_ra); } static void on_ompt_callback_target_emi(ompt_target_t kind, @@ -102,20 +136,21 @@ static void on_ompt_callback_target_emi(ompt_target_t kind, assert(codeptr_ra != 0 && "Unexpected null codeptr"); if (endpoint == ompt_scope_begin) target_data->value = next_op_id++; - printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p " + printf("Callback Target EMI: kind=%s endpoint=%s device_num=%d task_data=%p " "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n", - kind, endpoint, device_num, task_data, task_data->value, - target_task_data, target_task_data->value, target_data, - target_data->value, codeptr_ra); + ompt_target_t_values[kind], ompt_scope_endpoint_t_values[endpoint], + device_num, task_data, task_data ? task_data->value : 0, + target_task_data, target_task_data ? target_task_data->value : 0, + target_data, target_data->value, codeptr_ra); } static void on_ompt_callback_target_submit_emi( ompt_scope_endpoint_t endpoint, ompt_data_t *target_data, ompt_id_t *host_op_id, unsigned int requested_num_teams) { - printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p " + printf(" Callback Submit EMI: endpoint=%s req_num_teams=%d target_data=%p " "(0x%lx) host_op_id=%p (0x%lx)\n", - endpoint, requested_num_teams, target_data, target_data->value, - host_op_id, *host_op_id); + ompt_scope_endpoint_t_values[endpoint], requested_num_teams, + target_data, target_data->value, host_op_id, *host_op_id); } static void on_ompt_callback_target_map_emi(ompt_data_t *target_data, diff --git a/offload/test/ompt/omp_api.c b/offload/test/ompt/omp_api.c index a16ef7a64aa7d..5fb2098f0ce79 100644 --- a/offload/test/ompt/omp_api.c +++ b/offload/test/ompt/omp_api.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on #include "omp.h" #include @@ -32,8 +34,8 @@ int main(int argc, char **argv) { // clang-format off /// CHECK: Callback Init: -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=5 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=6 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_associate +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] 
optype=ompt_target_data_disassociate +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete /// CHECK: Callback Fini: diff --git a/offload/test/ompt/target_memcpy.c b/offload/test/ompt/target_memcpy.c index f244e0f418ed6..f769995579f50 100644 --- a/offload/test/ompt/target_memcpy.c +++ b/offload/test/ompt/target_memcpy.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Verify that for the target OpenMP APIs, the return address is non-null and @@ -46,26 +48,26 @@ int main() { } // clang-format off -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc /// CHECK-SAME: src_device_num=[[HOST:[0-9]+]] /// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]] /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE1:0x[0-f]+]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK-SAME: src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] /// CHECK-NOT: code=(nil) /// CHECK-NOT: code=[[CODE1]] /// CHECK: code=[[CODE2:0x[0-f]+]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device /// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] /// CHECK-NOT: code=(nil) /// CHECK-NOT: code=[[CODE2]] /// CHECK: code=[[CODE3:0x[0-f]+]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device /// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] /// CHECK-NOT: code=(nil) /// CHECK-NOT: code=[[CODE3]] /// CHECK: code=[[CODE4:0x[0-f]+]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete /// CHECK-NOT: code=(nil) /// CHECK-NOT: code=[[CODE4]] diff --git a/offload/test/ompt/target_memcpy_emi.c b/offload/test/ompt/target_memcpy_emi.c index 934caba6efab3..39f262a366f94 100644 --- a/offload/test/ompt/target_memcpy_emi.c +++ b/offload/test/ompt/target_memcpy_emi.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Verify all three data transfer directions: H2D, D2D and D2H @@ -54,28 +56,28 @@ int main(void) { /// CHECK: Callback Init: /// CHECK: Allocating Memory on Device -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc /// CHECK-SAME: src_device_num=[[HOST:[0-9]+]] /// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc {{.+}} 
src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] /// CHECK: Testing: Host to Device -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] /// CHECK: Testing: Device to Device -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] /// CHECK: Testing: Device to Host -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] /// CHECK: Checking Correctness /// CHECK: Freeing Memory on Device -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 {{.+}} src_device_num=[[DEVICE]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 {{.+}} src_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete {{.+}} src_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete {{.+}} src_device_num=[[DEVICE]] /// CHECK: Callback Fini: diff --git a/offload/test/ompt/veccopy.c b/offload/test/ompt/veccopy.c index f28d94f524bb8..24d7363e65599 100644 --- a/offload/test/ompt/veccopy.c +++ b/offload/test/ompt/veccopy.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Example OpenMP program that registers non-EMI callbacks @@ -54,48 +56,47 @@ int main() { // clang-format off /// CHECK: Callback Init: /// CHECK: Callback Load: -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 device_num=[[DEVICE_NUM:[0-9]+]] +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin device_num=[[DEVICE_NUM:[0-9]+]] /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE1:.*]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: 
Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete /// CHECK: code=[[CODE1]] -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 device_num=[[DEVICE_NUM]] code=[[CODE1]] +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end device_num=[[DEVICE_NUM]] code=[[CODE1]] -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 -/// device_num=[[DEVICE_NUM]] +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin device_num=[[DEVICE_NUM]] /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE2:.*]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0 -/// CHECK: Callback DataOp: 
target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 device_num=[[DEVICE_NUM]] code=[[CODE2]] +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end device_num=[[DEVICE_NUM]] code=[[CODE2]] /// CHECK: Callback Fini: diff --git a/offload/test/ompt/veccopy_data.c b/offload/test/ompt/veccopy_data.c index 059ca97c3cde3..9df5374193e94 100644 --- a/offload/test/ompt/veccopy_data.c +++ b/offload/test/ompt/veccopy_data.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Example OpenMP program that registers EMI callbacks. @@ -73,85 +75,86 @@ int main() { return rc; } +// clang-format off /// CHECK-NOT: Callback Target EMI: /// CHECK-NOT: device_num=-1 /// CHECK: Callback Init: /// CHECK: Callback Load: -/// CHECK: Callback Target EMI: kind=2 endpoint=1 +/// CHECK: Callback Target EMI: kind=ompt_target_enter_data endpoint=ompt_scope_begin /// CHECK-NOT: device_num=-1 /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE1:.*]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback Target EMI: kind=2 endpoint=2 +/// CHECK: Callback Target EMI: kind=ompt_target_enter_data endpoint=ompt_scope_end /// CHECK-NOT: device_num=-1 /// CHECK: code=[[CODE1]] -/// CHECK: Callback Target EMI: kind=1 endpoint=1 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin /// CHECK-NOT: device_num=-1 /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE2:.*]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: 
dest=(nil) /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 -/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=1 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end /// CHECK-NOT: device_num=-1 /// CHECK: code=[[CODE2]] -/// CHECK: Callback Target EMI: kind=3 endpoint=1 +/// CHECK: Callback Target EMI: kind=ompt_target_exit_data endpoint=ompt_scope_begin /// CHECK-NOT: device_num=-1 /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE3:.*]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE3]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE3]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete /// CHECK: code=[[CODE3]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete /// CHECK: code=[[CODE3]] -/// CHECK: Callback Target EMI: kind=3 endpoint=2 +/// CHECK: Callback Target EMI: kind=ompt_target_exit_data endpoint=ompt_scope_end /// CHECK-NOT: device_num=-1 /// CHECK: code=[[CODE3]] -/// CHECK: Callback Target EMI: kind=1 endpoint=1 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin /// CHECK-NOT: device_num=-1 /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE4:.*]] -/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 -/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1 -/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=1 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=1 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end /// CHECK-NOT: device_num=-1 /// CHECK: code=[[CODE4]] -/// CHECK: Callback Target EMI: kind=4 endpoint=1 +/// CHECK: Callback Target EMI: kind=ompt_target_update endpoint=ompt_scope_begin /// CHECK-NOT: device_num=-1 /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE5:.*]] -/// CHECK: Callback 
DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE5]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE5]] -/// CHECK: Callback Target EMI: kind=4 endpoint=2 +/// CHECK: Callback Target EMI: kind=ompt_target_update endpoint=ompt_scope_end /// CHECK-NOT: device_num=-1 /// CHECK: code=[[CODE5]] /// CHECK: Callback Fini: diff --git a/offload/test/ompt/veccopy_disallow_both.c b/offload/test/ompt/veccopy_disallow_both.c index b531a628803e4..bfc67c5f4d274 100644 --- a/offload/test/ompt/veccopy_disallow_both.c +++ b/offload/test/ompt/veccopy_disallow_both.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Example OpenMP program that shows that both EMI and non-EMI @@ -54,48 +56,49 @@ int main() { return rc; } +// clang-format off /// CHECK: Callback Init: /// CHECK: Callback Load: -/// CHECK: Callback Target EMI: kind=1 endpoint=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback Target EMI: kind=1 endpoint=2 -/// CHECK: Callback Target EMI: kind=1 endpoint=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback 
DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end /// CHECK: Callback Fini: diff --git a/offload/test/ompt/veccopy_emi.c b/offload/test/ompt/veccopy_emi.c index 2c57a85c14756..a1427b86a58fa 100644 --- a/offload/test/ompt/veccopy_emi.c +++ b/offload/test/ompt/veccopy_emi.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Example OpenMP program that registers EMI 
callbacks @@ -52,89 +54,90 @@ int main() { return rc; } +// clang-format off /// CHECK: Callback Init: /// CHECK: Callback Load: -/// CHECK: Callback Target EMI: kind=1 endpoint=1 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE1:.*]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 -/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=1 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete /// CHECK: code=[[CODE1]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete /// CHECK: code=[[CODE1]] -/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end /// CHECK: 
code=[[CODE1]] -/// CHECK: Callback Target EMI: kind=1 endpoint=1 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin /// CHECK-NOT: code=(nil) /// CHECK: code=[[CODE2:.*]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0 -/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=0 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=0 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=0 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete /// CHECK: code=[[CODE2]] -/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end /// CHECK: code=[[CODE2]] /// CHECK: Callback Fini: diff --git a/offload/test/ompt/veccopy_emi_map.c 
b/offload/test/ompt/veccopy_emi_map.c index fa18a43cd8a50..450faa1f28b0e 100644 --- a/offload/test/ompt/veccopy_emi_map.c +++ b/offload/test/ompt/veccopy_emi_map.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Example OpenMP program that shows that map-EMI callbacks are not supported. @@ -52,51 +54,52 @@ int main() { return rc; } +// clang-format off /// CHECK: 0: Could not register callback 'ompt_callback_target_map_emi' /// CHECK: Callback Init: /// CHECK: Callback Load: -/// CHECK: Callback Target EMI: kind=1 endpoint=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 -/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 -/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback Target EMI: kind=1 endpoint=2 -/// CHECK: Callback Target EMI: kind=1 endpoint=1 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=1 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: 
endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc /// CHECK-NOT: dest=(nil) -/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 -/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0 -/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=0 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 -/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 -/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=0 +/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=0 +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete +/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete +/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end /// CHECK: Callback Fini: diff --git a/offload/test/ompt/veccopy_map.c b/offload/test/ompt/veccopy_map.c index 2e817d328e59f..12e141ea74d07 100644 --- a/offload/test/ompt/veccopy_map.c +++ b/offload/test/ompt/veccopy_map.c @@ -1,6 +1,8 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt // REQUIRES: gpu +// clang-format on /* * Example OpenMP program that shows that map callbacks are not supported. 
@@ -51,31 +53,31 @@ int main() { return rc; } - +// clang-format off /// CHECK: 0: Could not register callback 'ompt_callback_target_map' /// CHECK: Callback Init: /// CHECK: Callback Load: -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 - -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end + +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] 
host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end /// CHECK: Callback Fini: diff --git a/offload/test/ompt/veccopy_no_device_init.c b/offload/test/ompt/veccopy_no_device_init.c index 8ee8243281187..ade06fcc92290 100644 --- a/offload/test/ompt/veccopy_no_device_init.c +++ b/offload/test/ompt/veccopy_no_device_init.c @@ -1,6 +1,7 @@ // clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt +// clang-format on /* * Example OpenMP program that shows that if no device init callback @@ -51,30 +52,31 @@ int main() { return rc; } + // clang-format off /// CHECK-NOT: Callback Init: /// CHECK: Callback Load: -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback Target: 
target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end /// CHECK-NOT: Callback Fini: diff --git a/offload/test/ompt/veccopy_wrong_return.c b/offload/test/ompt/veccopy_wrong_return.c index 2d07b4e1bf04a..17327f3553817 100644 --- a/offload/test/ompt/veccopy_wrong_return.c +++ b/offload/test/ompt/veccopy_wrong_return.c @@ -1,5 +1,7 @@ +// clang-format off // RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: ompt +// clang-format on /* * Example OpenMP program that shows that if the initialize function @@ -51,29 +53,30 @@ int main() { return rc; } +// clang-format off /// CHECK-NOT: Callback 
Init: /// CHECK-NOT: Callback Load: -/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device /// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end -/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] 
optype=ompt_target_data_transfer_to_device /// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 -/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete +/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end /// CHECK-NOT: Callback Fini From f1c1063acb12c7ce7724a88bf93351dd76b561c4 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 22 Oct 2025 01:51:10 -0700 Subject: [PATCH 066/530] Revert "[InstCombinePHI] Enhance PHI CSE to remove redundant phis" (#164520) Reverts llvm/llvm-project#163453 Causes crashes, see https://github.com/llvm/llvm-project/pull/163453#issuecomment-3429922732 --- llvm/lib/Analysis/InstructionSimplify.cpp | 92 +------ .../InstCombine/select_with_identical_phi.ll | 243 ------------------ 2 files changed, 1 insertion(+), 334 deletions(-) delete mode 100644 llvm/test/Transforms/InstCombine/select_with_identical_phi.ll diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index b573023372d81..8da51d039f197 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4866,89 +4866,6 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, return nullptr; } -/// Look for the following pattern and simplify %to_fold to %identicalPhi. -/// Here %phi, %to_fold and %phi.next perform the same functionality as -/// %identicalPhi and hence the select instruction %to_fold can be folded -/// into %identicalPhi. -/// -/// BB1: -/// %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ] -/// %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ] -/// ... -/// %identicalPhi.next = select %cmp, %val, %identicalPhi -/// (or select %cmp, %identicalPhi, %val) -/// %to_fold = select %cmp2, %identicalPhi, %phi -/// %phi.next = select %cmp, %val, %to_fold -/// (or select %cmp, %to_fold, %val) -/// -/// Prove that %phi and %identicalPhi are the same by induction: -/// -/// Base case: Both %phi and %identicalPhi are equal on entry to the loop. -/// Inductive case: -/// Suppose %phi and %identicalPhi are equal at iteration i. -/// We look at their values at iteration i+1 which are %phi.next and -/// %identicalPhi.next. They would have become different only when %cmp is -/// false and the corresponding values %to_fold and %identicalPhi differ -/// (similar reason for the other "or" case in the bracket). 
-/// -/// The only condition when %to_fold and %identicalPh could differ is when %cmp2 -/// is false and %to_fold is %phi, which contradicts our inductive hypothesis -/// that %phi and %identicalPhi are equal. Thus %phi and %identicalPhi are -/// always equal at iteration i+1. -bool isSimplifierIdenticalPHI(PHINode &PN, PHINode &IdenticalPN) { - if (PN.getParent() != IdenticalPN.getParent()) - return false; - if (PN.getNumIncomingValues() != 2) - return false; - - // Check that only the backedge incoming value is different. - unsigned DiffVals = 0; - BasicBlock *DiffValBB = nullptr; - for (unsigned i = 0; i < 2; i++) { - BasicBlock *PredBB = PN.getIncomingBlock(i); - if (PN.getIncomingValueForBlock(PredBB) != - IdenticalPN.getIncomingValueForBlock(PredBB)) { - DiffVals++; - DiffValBB = PredBB; - } - } - if (DiffVals != 1) - return false; - // Now check that the backedge incoming values are two select - // instructions with the same condition. Either their true - // values are the same, or their false values are the same. - auto *SI = dyn_cast(PN.getIncomingValueForBlock(DiffValBB)); - auto *IdenticalSI = - dyn_cast(IdenticalPN.getIncomingValueForBlock(DiffValBB)); - if (!SI || !IdenticalSI) - return false; - if (SI->getCondition() != IdenticalSI->getCondition()) - return false; - - SelectInst *SIOtherVal = nullptr; - Value *IdenticalSIOtherVal = nullptr; - if (SI->getTrueValue() == IdenticalSI->getTrueValue()) { - SIOtherVal = dyn_cast(SI->getFalseValue()); - IdenticalSIOtherVal = IdenticalSI->getFalseValue(); - } else if (SI->getFalseValue() == IdenticalSI->getFalseValue()) { - SIOtherVal = dyn_cast(SI->getTrueValue()); - IdenticalSIOtherVal = IdenticalSI->getTrueValue(); - } else { - return false; - } - - // Now check that the other values in select, i.e., %to_fold and - // %identicalPhi, are essentially the same value. - if (!SIOtherVal || IdenticalSIOtherVal != &IdenticalPN) - return false; - if (!(SIOtherVal->getTrueValue() == &IdenticalPN && - SIOtherVal->getFalseValue() == &PN) && - !(SIOtherVal->getTrueValue() == &PN && - SIOtherVal->getFalseValue() == &IdenticalPN)) - return false; - return true; -} - /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, @@ -5124,14 +5041,7 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, std::optional Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL); if (Imp) return *Imp ? TrueVal : FalseVal; - // Look for same PHIs in the true and false values. - if (auto *TruePHI = dyn_cast(TrueVal)) - if (auto *FalsePHI = dyn_cast(FalseVal)) { - if (isSimplifierIdenticalPHI(*TruePHI, *FalsePHI)) - return FalseVal; - if (isSimplifierIdenticalPHI(*FalsePHI, *TruePHI)) - return TrueVal; - } + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll deleted file mode 100644 index 7816781250799..0000000000000 --- a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll +++ /dev/null @@ -1,243 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -passes=instcombine | FileCheck %s -@A = extern_weak global float, align 4 - -; %same.as.v1 is a select with two phis %v1 and %phi.to.remove as the true -; and false values, while %v1 and %phi.to.remove are actually the same. -; Fold the selection instruction %same.as.v1 to %v1. 
-define void @select_with_identical_phi(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %sub, float %v1 - %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; %phi.to.remove.next and %v1.1 are swapped. -; Check that %same.as.v1 can be folded. 
-define void @select_with_identical_phi_2(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %v1, float %sub - %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; same.as.v1 are swapped. -; Check that %same.as.v1 can be folded. 
-define void @select_with_identical_phi_3(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_3( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %sub, float %v1 - %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} - -; The difference from select_with_identical_phi() is that the true and false values in -; %same.as.v1, %phi.to.remove.next and %v1.1 are swapped. -; Check that %same.as.v1 can be folded. 
-define void @select_with_identical_phi_4(ptr %m, ptr %n, i32 %count) { -; CHECK-LABEL: @select_with_identical_phi_4( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4 -; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]] -; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]] -; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]] -; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4 -; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4 -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ] - %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ] - %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] - %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ] - %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ] - %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ] - %q.load = load float, ptr %q - %c.load = load float, ptr %c - %sub = fsub float %q.load, %c.load - %cmp1 = fcmp olt float %sub, %v0 - %v0.1 = select i1 %cmp1, float %sub, float %v0 - %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1 - %cmp2 = fcmp ogt float %sub, %same.as.v1 - %v1.1 = select i1 %cmp2, float %v1, float %sub - %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub - %inc.i = add nuw nsw i32 %i, 1 - %q.next = getelementptr inbounds i8, ptr %q, i64 4 - %c.next = getelementptr inbounds i8, ptr %c, i64 4 - %exitcond = icmp eq i32 %inc.i, %count - br i1 %exitcond, label %exit, label %for.body - -exit: - %vl.1.lcssa = phi float [ %v1.1, %for.body ] - store float %vl.1.lcssa, ptr @A - ret void -} From 954a7190561e12802fd143bbaf58e73f54fe58eb Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 22 Oct 2025 11:01:33 +0200 Subject: [PATCH 067/530] [clang][bytecode] Fix unsigned wraparound behavior with bitfields (#164445) and inc/dec operations. We need to truncate the new value to the bitfield bitwidth. 
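For reference, the wraparound semantics being fixed here can be sketched independently of the interpreter: after an increment or decrement of an unsigned bitfield, the result is reduced modulo 2^width before being stored back. A minimal stand-alone illustration (plain C++, not the interpreter's own integral types; the helper name is invented for this example):

#include <cassert>
#include <cstdint>

// Keep only the low BitWidth bits, mirroring the store back into the
// bitfield after ++/--.
constexpr uint64_t truncateToBitWidth(uint64_t Value, unsigned BitWidth) {
  return BitWidth >= 64 ? Value : Value & ((uint64_t{1} << BitWidth) - 1);
}

int main() {
  // struct { unsigned u : 5; } a{}; a.u--;  ==>  a.u == 31
  assert(truncateToBitWidth(uint64_t{0} - 1, 5) == 31);
  // a.u = 31; ++a.u;  ==>  a.u == 0
  assert(truncateToBitWidth(31 + 1, 5) == 0);
  return 0;
}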
--- clang/lib/AST/ByteCode/Compiler.cpp | 44 ++++++++++++++- clang/lib/AST/ByteCode/Interp.h | 76 +++++++++++++++++++++++++- clang/lib/AST/ByteCode/Opcodes.td | 13 +++++ clang/test/AST/ByteCode/bitfields.cpp | 79 +++++++++++++++++++++++++++ 4 files changed, 206 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 6b989276e6d7d..f15b3c1e5aae4 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6432,6 +6432,13 @@ bool Compiler::visitFunc(const FunctionDecl *F) { return this->emitNoRet(SourceInfo{}); } +static uint32_t getBitWidth(const Expr *E) { + assert(E->refersToBitField()); + const auto *ME = cast(E); + const auto *FD = cast(ME->getMemberDecl()); + return FD->getBitWidthValue(); +} + template bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { const Expr *SubExpr = E->getSubExpr(); @@ -6460,10 +6467,15 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return DiscardResult ? this->emitPopPtr(E) : true; } - if (T == PT_Float) { + if (T == PT_Float) return DiscardResult ? this->emitIncfPop(getFPOptions(E), E) : this->emitIncf(getFPOptions(E), E); - } + + if (SubExpr->refersToBitField()) + return DiscardResult ? this->emitIncPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitIncBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); return DiscardResult ? this->emitIncPop(*T, E->canOverflow(), E) : this->emitInc(*T, E->canOverflow(), E); @@ -6484,9 +6496,15 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return DiscardResult ? this->emitPopPtr(E) : true; } - if (T == PT_Float) { + if (T == PT_Float) return DiscardResult ? this->emitDecfPop(getFPOptions(E), E) : this->emitDecf(getFPOptions(E), E); + + if (SubExpr->refersToBitField()) { + return DiscardResult ? this->emitDecPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitDecBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); } return DiscardResult ? this->emitDecPop(*T, E->canOverflow(), E) @@ -6515,6 +6533,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (DiscardResult) { if (T == PT_Float) return this->emitIncfPop(getFPOptions(E), E); + if (SubExpr->refersToBitField()) + return DiscardResult ? this->emitIncPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitIncBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); return this->emitIncPop(*T, E->canOverflow(), E); } @@ -6530,6 +6553,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return false; if (!this->emitStoreFloat(E)) return false; + } else if (SubExpr->refersToBitField()) { + assert(isIntegralType(*T)); + if (!this->emitPreIncBitfield(*T, E->canOverflow(), getBitWidth(SubExpr), + E)) + return false; } else { assert(isIntegralType(*T)); if (!this->emitPreInc(*T, E->canOverflow(), E)) @@ -6560,6 +6588,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (DiscardResult) { if (T == PT_Float) return this->emitDecfPop(getFPOptions(E), E); + if (SubExpr->refersToBitField()) + return DiscardResult ? 
this->emitDecPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitDecBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); return this->emitDecPop(*T, E->canOverflow(), E); } @@ -6575,6 +6608,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return false; if (!this->emitStoreFloat(E)) return false; + } else if (SubExpr->refersToBitField()) { + assert(isIntegralType(*T)); + if (!this->emitPreDecBitfield(*T, E->canOverflow(), getBitWidth(SubExpr), + E)) + return false; } else { assert(isIntegralType(*T)); if (!this->emitPreDec(*T, E->canOverflow(), E)) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index d8529daf591ee..89f6fbefb1907 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -702,7 +702,7 @@ enum class IncDecOp { template bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - bool CanOverflow) { + bool CanOverflow, UnsignedOrNone BitWidth = std::nullopt) { assert(!Ptr.isDummy()); if (!S.inConstantContext()) { @@ -725,12 +725,18 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if constexpr (Op == IncDecOp::Inc) { if (!T::increment(Value, &Result) || !CanOverflow) { - Ptr.deref() = Result; + if (BitWidth) + Ptr.deref() = Result.truncate(*BitWidth); + else + Ptr.deref() = Result; return true; } } else { if (!T::decrement(Value, &Result) || !CanOverflow) { - Ptr.deref() = Result; + if (BitWidth) + Ptr.deref() = Result.truncate(*BitWidth); + else + Ptr.deref() = Result; return true; } } @@ -774,6 +780,17 @@ bool Inc(InterpState &S, CodePtr OpPC, bool CanOverflow) { CanOverflow); } +template ::T> +bool IncBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + unsigned BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + /// 1) Pops a pointer from the stack /// 2) Load the value from the pointer /// 3) Writes the value increased by one back to the pointer @@ -786,6 +803,17 @@ bool IncPop(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool IncPopBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + template ::T> bool PreInc(InterpState &S, CodePtr OpPC, bool CanOverflow) { const Pointer &Ptr = S.Stk.peek(); @@ -795,6 +823,17 @@ bool PreInc(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool PreIncBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.peek(); + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + /// 1) Pops a pointer from the stack /// 2) Load the value from the pointer /// 3) Writes the value decreased by one back to the pointer @@ -808,6 +847,16 @@ bool Dec(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool DecBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} /// 1) Pops a pointer from the 
stack /// 2) Load the value from the pointer @@ -821,6 +870,17 @@ bool DecPop(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool DecPopBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + template ::T> bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) { const Pointer &Ptr = S.Stk.peek(); @@ -829,6 +889,16 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool PreDecBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.peek(); + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) + return false; + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + template bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, uint32_t FPOI) { diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 532c4448e6f40..406feb57418fa 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -612,12 +612,25 @@ class OverflowOpcode : Opcode { let HasGroup = 1; } +class OverflowBitfieldOpcode : Opcode { + let Types = [AluTypeClass]; + let Args = [ArgBool, ArgUint32]; + let HasGroup = 1; +} + def Inc : OverflowOpcode; +def IncBitfield : OverflowBitfieldOpcode; def IncPop : OverflowOpcode; +def IncPopBitfield : OverflowBitfieldOpcode; def PreInc : OverflowOpcode; +def PreIncBitfield : OverflowBitfieldOpcode; + def Dec : OverflowOpcode; +def DecBitfield : OverflowBitfieldOpcode; def DecPop : OverflowOpcode; +def DecPopBitfield : OverflowBitfieldOpcode; def PreDec : OverflowOpcode; +def PreDecBitfield : OverflowBitfieldOpcode; // Float increment and decrement. 
def Incf: FloatOpcode; diff --git a/clang/test/AST/ByteCode/bitfields.cpp b/clang/test/AST/ByteCode/bitfields.cpp index df8d5678287d3..a58328286bd9a 100644 --- a/clang/test/AST/ByteCode/bitfields.cpp +++ b/clang/test/AST/ByteCode/bitfields.cpp @@ -128,3 +128,82 @@ namespace NonConstBitWidth { // both-note {{read of non-const variable}} }; } + +namespace IncDecOverflow { + constexpr bool test1() { + struct {unsigned u: 5; } a {}; + a.u--; + return a.u == 31; + } + static_assert(test1(), ""); + + constexpr bool test2() { + struct {unsigned u: 5; } a {}; + --a.u; + return a.u == 31; + } + static_assert(test2(), ""); + + constexpr bool test3() { + int x = 0; + struct {unsigned u: 5; } a {}; + x = a.u--; + return a.u == 31; + } + static_assert(test3(), ""); + + constexpr bool test4() { + int x = 0; + struct {unsigned u: 5; } a {}; + x = --a.u; + return a.u == 31; + } + static_assert(test4(), ""); + + constexpr bool test5() { + struct {unsigned u: 5; } a {}; + a.u = 31; + ++a.u; + + return a.u == 0; + } + static_assert(test5(), ""); + + constexpr bool test6() { + struct {unsigned u: 5; } a {}; + a.u = 31; + ++a.u; + + return a.u == 0; + } + static_assert(test6(), ""); + + constexpr bool test7() { + struct {unsigned u: 5; } a {}; + a.u = 31; + a.u++; + + return a.u == 0; + } + static_assert(test7(), ""); + + constexpr bool test8() { + int x = 0; + struct {unsigned u: 5; } a {}; + a.u = 31; + x = a.u++; + + return a.u == 0; + } + static_assert(test8(), ""); + + constexpr bool test9() { + int x = 0; + struct {unsigned u: 5; } a {}; + a.u = 31; + x = ++a.u; + + return a.u == 0; + } + static_assert(test9(), ""); +} From 5afe29e0faaee44c44f1daa3e2693fc1ab212530 Mon Sep 17 00:00:00 2001 From: Luo Yuanke Date: Wed, 22 Oct 2025 17:11:27 +0800 Subject: [PATCH 068/530] [FastIsel] Get the right register type for call instruction (#164565) When switch from fast isel to dag isel the input value is from llvm IR instruction. If the instruction is call we should get the calling convention of the callee and pass it to RegsForValue::getCopyFromRegs, so that it can deduce the right RegisterVT of the returned value of the callee. 
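The relevant distinction is that TargetLowering exposes per-calling-convention register-type queries, and RegsForValue only consults them when a convention is supplied. Roughly (a simplified sketch of the lookup, not the exact upstream code path):

#include "llvm/CodeGen/TargetLowering.h"
#include <optional>

// Sketch: pick the register type for a call's result. With the callee's
// calling convention available, the per-CC hook is used; without it, the
// generic query may pick a different register type for types such as bf16.
static llvm::MVT getResultRegisterType(const llvm::TargetLowering &TLI,
                                       llvm::LLVMContext &Ctx, llvm::EVT VT,
                                       std::optional<llvm::CallingConv::ID> CC) {
  if (CC)
    return TLI.getRegisterTypeForCallingConv(Ctx, *CC, VT);
  return TLI.getRegisterType(Ctx, VT);
}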
--------- Co-authored-by: Yuanke Luo --- .../SelectionDAG/SelectionDAGBuilder.cpp | 7 +- llvm/test/CodeGen/X86/bf16-fast-isel.ll | 66 +++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/bf16-fast-isel.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 20a0efd3afa1c..dcf2df305d24a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1977,8 +1977,13 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (const Instruction *Inst = dyn_cast(V)) { Register InReg = FuncInfo.InitializeRegForValue(Inst); + std::optional CallConv; + auto *CI = dyn_cast(Inst); + if (CI && !CI->isInlineAsm()) + CallConv = CI->getCallingConv(); + RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg, - Inst->getType(), std::nullopt); + Inst->getType(), CallConv); SDValue Chain = DAG.getEntryNode(); return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } diff --git a/llvm/test/CodeGen/X86/bf16-fast-isel.ll b/llvm/test/CodeGen/X86/bf16-fast-isel.ll new file mode 100644 index 0000000000000..c659e0e647d36 --- /dev/null +++ b/llvm/test/CodeGen/X86/bf16-fast-isel.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --fast-isel < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define i8 @test_direct_call(ptr %f) nounwind { +; CHECK-LABEL: test_direct_call: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq +entry: + %call = call bfloat @foo(ptr %f) + %call2 = call zeroext i8 @bar(bfloat %call) + ret i8 %call2 +} + +define i8 @test_fast_direct_call(ptr %f) nounwind { +; CHECK-LABEL: test_fast_direct_call: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq foo_fast@PLT +; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq +entry: + %call = call fastcc bfloat @foo_fast(ptr %f) + %call2 = call zeroext i8 @bar(bfloat %call) + ret i8 %call2 +} + +define i8 @test_indirect_all(ptr %fptr, ptr %f) nounwind { +; CHECK-LABEL: test_indirect_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq *%rbx +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + %call = call bfloat @foo(ptr %f) + %call2 = call zeroext i8 %fptr(bfloat %call) + ret i8 %call2 +} + +define i8 @test_fast_indirect_all(ptr %fptr, ptr %f) nounwind { +; CHECK-LABEL: test_fast_indirect_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: callq *%rbx +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + %call = call fastcc bfloat @foo(ptr %f) + %call2 = call zeroext i8 %fptr(bfloat %call) + ret i8 %call2 +} + +declare bfloat @foo(ptr %f) +declare zeroext i8 @bar(bfloat) +declare fastcc bfloat @foo_fast(ptr %f) From 29623f1e98f4970d5b9aa62875cda5c2c66694a8 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 22 Oct 2025 11:23:02 +0200 Subject: [PATCH 069/530] [AllocToken] Make token mode a pass parameter (#163634) Refactor the AllocToken pass to accept the mode via pass options rather than LLVM cl::opt. 
This is both cleaner, but also required to make the mode frontend-driven and avoid potential inconsistencies. --- llvm/include/llvm/Support/AllocToken.h | 10 ++++++++ .../Transforms/Instrumentation/AllocToken.h | 2 ++ llvm/lib/Passes/PassBuilder.cpp | 25 +++++++++++++++++++ llvm/lib/Passes/PassRegistry.def | 5 +++- llvm/lib/Support/AllocToken.cpp | 11 ++++++++ .../Transforms/Instrumentation/AllocToken.cpp | 18 +------------ llvm/test/Instrumentation/AllocToken/basic.ll | 2 +- .../Instrumentation/AllocToken/basic32.ll | 2 +- llvm/test/Instrumentation/AllocToken/fast.ll | 2 +- .../Instrumentation/AllocToken/intrinsic.ll | 2 +- .../Instrumentation/AllocToken/intrinsic32.ll | 2 +- .../test/Instrumentation/AllocToken/invoke.ll | 2 +- .../Instrumentation/AllocToken/nonlibcalls.ll | 2 +- .../AllocToken/typehashpointersplit.ll | 2 +- 14 files changed, 61 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/Support/AllocToken.h b/llvm/include/llvm/Support/AllocToken.h index 8d82670eaeb8d..e40d8163a9d7c 100644 --- a/llvm/include/llvm/Support/AllocToken.h +++ b/llvm/include/llvm/Support/AllocToken.h @@ -14,6 +14,7 @@ #define LLVM_SUPPORT_ALLOCTOKEN_H #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" #include #include @@ -36,6 +37,15 @@ enum class AllocTokenMode { TypeHashPointerSplit, }; +/// The default allocation token mode. +inline constexpr AllocTokenMode DefaultAllocTokenMode = + AllocTokenMode::TypeHashPointerSplit; + +/// Returns the AllocTokenMode from its canonical string name; if an invalid +/// name was provided returns nullopt. +LLVM_ABI std::optional +getAllocTokenModeFromString(StringRef Name); + /// Metadata about an allocation used to generate a token ID. struct AllocTokenMetadata { SmallString<64> TypeName; diff --git a/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h b/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h index b1391cb04302c..077703c214745 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h +++ b/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h @@ -16,6 +16,7 @@ #include "llvm/IR/Analysis.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/AllocToken.h" #include namespace llvm { @@ -23,6 +24,7 @@ namespace llvm { class Module; struct AllocTokenOptions { + AllocTokenMode Mode = DefaultAllocTokenMode; std::optional MaxTokens; bool FastABI = false; bool Extended = false; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e45cac848d0a2..048c58da36277 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1095,6 +1095,31 @@ Expected parseMSanPassOptions(StringRef Params) { return Result; } +Expected parseAllocTokenPassOptions(StringRef Params) { + AllocTokenOptions Result; + while (!Params.empty()) { + StringRef ParamName; + std::tie(ParamName, Params) = Params.split(';'); + + if (ParamName.consume_front("mode=")) { + if (auto Mode = getAllocTokenModeFromString(ParamName)) + Result.Mode = *Mode; + else + return make_error( + formatv("invalid argument to AllocToken pass mode " + "parameter: '{}'", + ParamName) + .str(), + inconvertibleErrorCode()); + } else { + return make_error( + formatv("invalid AllocToken pass parameter '{}'", ParamName).str(), + inconvertibleErrorCode()); + } + } + return Result; +} + /// Parser of parameters for SimplifyCFG pass. 
Expected parseSimplifyCFGOptions(StringRef Params) { SimplifyCFGOptions Result; diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 884d8daf63463..a66b6e4f3a17d 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -126,7 +126,6 @@ MODULE_PASS("openmp-opt", OpenMPOptPass()) MODULE_PASS("openmp-opt-postlink", OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink)) MODULE_PASS("partial-inliner", PartialInlinerPass()) -MODULE_PASS("alloc-token", AllocTokenPass()) MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion()) MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen()) MODULE_PASS("pgo-instr-use", PGOInstrumentationUse()) @@ -182,6 +181,10 @@ MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) #ifndef MODULE_PASS_WITH_PARAMS #define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) #endif +MODULE_PASS_WITH_PARAMS( + "alloc-token", "AllocTokenPass", + [](AllocTokenOptions Opts) { return AllocTokenPass(Opts); }, + parseAllocTokenPassOptions, "mode=") MODULE_PASS_WITH_PARAMS( "asan", "AddressSanitizerPass", [](AddressSanitizerOptions Opts) { return AddressSanitizerPass(Opts); }, diff --git a/llvm/lib/Support/AllocToken.cpp b/llvm/lib/Support/AllocToken.cpp index 95ecda2ffd8ba..8e9e89f0df353 100644 --- a/llvm/lib/Support/AllocToken.cpp +++ b/llvm/lib/Support/AllocToken.cpp @@ -11,11 +11,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/AllocToken.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SipHash.h" using namespace llvm; +std::optional +llvm::getAllocTokenModeFromString(StringRef Name) { + return StringSwitch>(Name) + .Case("increment", AllocTokenMode::Increment) + .Case("random", AllocTokenMode::Random) + .Case("typehash", AllocTokenMode::TypeHash) + .Case("typehashpointersplit", AllocTokenMode::TypeHashPointerSplit) + .Default(std::nullopt); +} + static uint64_t getStableHash(const AllocTokenMetadata &Metadata, uint64_t MaxTokens) { return getStableSipHash(Metadata.TypeName) % MaxTokens; diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp index 08738450e4b74..8181e4ef1d74f 100644 --- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp +++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp @@ -63,22 +63,6 @@ namespace { //===--- Command-line options ---------------------------------------------===// -cl::opt ClMode( - "alloc-token-mode", cl::Hidden, cl::desc("Token assignment mode"), - cl::init(TokenMode::TypeHashPointerSplit), - cl::values( - clEnumValN(TokenMode::Increment, "increment", - "Incrementally increasing token ID"), - clEnumValN(TokenMode::Random, "random", - "Statically-assigned random token ID"), - clEnumValN(TokenMode::TypeHash, "typehash", - "Token ID based on allocated type hash"), - clEnumValN( - TokenMode::TypeHashPointerSplit, "typehashpointersplit", - "Token ID based on allocated type hash, where the top half " - "ID-space is reserved for types that contain pointers and the " - "bottom half for types that do not contain pointers. 
"))); - cl::opt ClFuncPrefix("alloc-token-prefix", cl::desc("The allocation function prefix"), cl::Hidden, cl::init("__alloc_token_")); @@ -265,7 +249,7 @@ class AllocToken { : Options(transformOptionsFromCl(std::move(Opts))), Mod(M), FAM(MAM.getResult(M).getManager()), Mode(IncrementMode(*IntPtrTy, *Options.MaxTokens)) { - switch (ClMode.getValue()) { + switch (Options.Mode) { case TokenMode::Increment: break; case TokenMode::Random: diff --git a/llvm/test/Instrumentation/AllocToken/basic.ll b/llvm/test/Instrumentation/AllocToken/basic.ll index 099d37df264d6..0c34b1373cfa1 100644 --- a/llvm/test/Instrumentation/AllocToken/basic.ll +++ b/llvm/test/Instrumentation/AllocToken/basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/basic32.ll b/llvm/test/Instrumentation/AllocToken/basic32.ll index 944a452f4b4d7..52d1d1446f0ef 100644 --- a/llvm/test/Instrumentation/AllocToken/basic32.ll +++ b/llvm/test/Instrumentation/AllocToken/basic32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token' -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" diff --git a/llvm/test/Instrumentation/AllocToken/fast.ll b/llvm/test/Instrumentation/AllocToken/fast.ll index 19a3ef6bb9ede..f6bf5ee0b7c97 100644 --- a/llvm/test/Instrumentation/AllocToken/fast.ll +++ b/llvm/test/Instrumentation/AllocToken/fast.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token' -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic.ll b/llvm/test/Instrumentation/AllocToken/intrinsic.ll index 13aaa90008a7c..5c6f2f147b3d0 100644 --- a/llvm/test/Instrumentation/AllocToken/intrinsic.ll +++ b/llvm/test/Instrumentation/AllocToken/intrinsic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID. 
; -; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='alloc-token' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll index eb5dbbe91a83e..15f7c258e2a5d 100644 --- a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll +++ b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID. ; -; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='alloc-token' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" target triple = "i386-pc-linux-gnu" diff --git a/llvm/test/Instrumentation/AllocToken/invoke.ll b/llvm/test/Instrumentation/AllocToken/invoke.ll index 347c99a2e8f8d..8e7ab3848dc05 100644 --- a/llvm/test/Instrumentation/AllocToken/invoke.ll +++ b/llvm/test/Instrumentation/AllocToken/invoke.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll index 19673da1bcfb6..45f573ee7b044 100644 --- a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll +++ b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-extended -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token' -alloc-token-extended -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll index 1f776480c5b3a..4d1be5eac8cd2 100644 --- a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll +++ b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s +; RUN: opt < %s -passes='inferattrs,alloc-token' -alloc-token-max=2 -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" From 104b78330662ec13fcdd472582589b2aee8428b9 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 22 Oct 2025 10:38:58 +0100 Subject: [PATCH 070/530] [AArch64] Extend bitcast(extload) tests. 
NFC --- .../test/CodeGen/AArch64/load-zext-bitcast.ll | 525 +++++++++++++++++- 1 file changed, 513 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll index 1a83930dfafa2..9193025264cc0 100644 --- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s ; load zero-extended i32, bitcast to f64 -define double @_Z9load_u64_from_u32_testPj(ptr %n){ -; CHECK-LABEL: _Z9load_u64_from_u32_testPj: +define double @load_u64_from_u32(ptr %n){ +; CHECK-LABEL: load_u64_from_u32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ret @@ -15,8 +15,8 @@ entry: } ; load zero-extended i16, bitcast to f64 -define double @_Z9load_u64_from_u16_testPj(ptr %n){ -; CHECK-LABEL: _Z9load_u64_from_u16_testPj: +define double @load_u64_from_u16(ptr %n){ +; CHECK-LABEL: load_u64_from_u16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ret @@ -28,8 +28,8 @@ entry: } ; load zero-extended i8, bitcast to f64 -define double @_Z16load_u64_from_u8Ph(ptr %n){ -; CHECK-LABEL: _Z16load_u64_from_u8Ph: +define double @load_u64_from_u8(ptr %n){ +; CHECK-LABEL: load_u64_from_u8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: ret @@ -41,8 +41,8 @@ entry: } ; load zero-extended i16, bitcast to f32 -define float @_Z17load_u32_from_u16Pt(ptr %n){ -; CHECK-LABEL: _Z17load_u32_from_u16Pt: +define float @load_u32_from_u16(ptr %n){ +; CHECK-LABEL: load_u32_from_u16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ret @@ -54,8 +54,8 @@ entry: } ; load zero-extended i8, bitcast to f32 -define float @_Z16load_u32_from_u8Ph(ptr %n){ -; CHECK-LABEL: _Z16load_u32_from_u8Ph: +define float @load_u32_from_u8(ptr %n){ +; CHECK-LABEL: load_u32_from_u8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: ret @@ -67,8 +67,8 @@ entry: } ; load zero-extended i8, bitcast to f16 -define half @_Z16load_u16_from_u8Ph(ptr %n){ -; CHECK-LABEL: _Z16load_u16_from_u8Ph: +define half @load_u16_from_u8(ptr %n){ +; CHECK-LABEL: load_u16_from_u8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr b0, [x0] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 @@ -80,3 +80,504 @@ entry: ret half %1 } + +define double @load_u64_from_u32_off1(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off1(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off1(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off1: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off1(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 1 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrh w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off2(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off2(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 2 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldur w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double 
@load_u64_from_u16_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off255(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldurh w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_off255(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off255: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0, #255] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 255 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + +define double @load_u64_from_u32_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [x0, #256] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #128] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #64] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_off256(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #256] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret 
float %1 +} + +define half @load_u16_from_u8_off256(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_off256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 256 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + + +define double @load_u64_from_u32_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr s0, [x0, #16380] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 16380 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #8190 // =0x1ffe +; CHECK-NEXT: ldr h0, [x0, x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8190 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define float @load_u32_from_u16_offn(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #8190] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8190 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_offn(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_offn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #4095] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4095 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + + +define double @load_u64_from_u32_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u32_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 16384 + %0 = load i32, ptr %p, align 4 + %conv = zext i32 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u16_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u16_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr h0, [x0, #4096] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8192 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i64 + %1 = bitcast i64 %conv to double + ret double %1 +} + +define double @load_u64_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u64_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #1024] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i64 + %1 = bitcast i64 %conv 
to double + ret double %1 +} + +define float @load_u32_from_u16_offnp1(ptr %n){ +; CHECK-LABEL: load_u32_from_u16_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192 +; CHECK-NEXT: ldr h0, [x8] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 8192 + %0 = load i16, ptr %p, align 2 + %conv = zext i16 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define float @load_u32_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u32_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i32 + %1 = bitcast i32 %conv to float + ret float %1 +} + +define half @load_u16_from_u8_offnp1(ptr %n){ +; CHECK-LABEL: load_u16_from_u8_offnp1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret +entry: + %p = getelementptr i8, ptr %n, i64 4096 + %0 = load i8, ptr %p, align 1 + %conv = zext i8 %0 to i16 + %1 = bitcast i16 %conv to half + ret half %1 +} + From c9fb37c75f741f1179f2d2c661d27d36645b0310 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 22 Oct 2025 11:46:18 +0200 Subject: [PATCH 071/530] [flang][FIR] add fir.assumed_size_extent to abstract assumed-size extent encoding (#164452) The purpose of this patch is to allow converting FIR array representation to memref when possible without hitting memref verifier issue. The issue was that FIR arrays may be assumed size, in which case the last dimension will not be known at runtime. Flang uses -1 to encode this to fulfill Fortran 2023 standard requirements in 18.5.3 point 5 about CFI_desc_t. When arrays are converted to memeref, if this `-1` reaches memeref operations, it triggers verifier errors (even if the conversion happened in code that guards the code to be entered at runtime if the array is assumed-size because folders/verifiers do not take into account reachability). 
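As a concrete illustration (mirroring the lowering tests updated below, with SSA names abbreviated), the FIR produced for an assumed-size dummy changes from

  %ext = arith.constant -1 : index
  %shape = fir.shape %ext : (index) -> !fir.shape<1>

to

  %ext = fir.assumed_size_extent : index
  %shape = fir.shape %ext : (index) -> !fir.shape<1>

so the sentinel is only materialized as -1 when lowering FIR to LLVM, and earlier passes can test for it explicitly (via the companion fir.is_assumed_size_extent op added here) instead of pattern-matching a literal -1.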
This follows-up on discussions in #163505 merge requests --- .../include/flang/Optimizer/Dialect/FIROps.td | 35 +++++++++++++++++ flang/lib/Lower/Bridge.cpp | 4 +- flang/lib/Lower/ConvertVariable.cpp | 2 +- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 39 +++++++++++++++++++ flang/lib/Optimizer/Dialect/FIROps.cpp | 28 +++++++++++++ .../Optimizer/Transforms/ArrayValueCopy.cpp | 4 +- flang/test/Fir/assumed-size-ops-codegen.fir | 19 +++++++++ flang/test/Fir/assumed-size-ops-folding.fir | 13 +++++++ flang/test/Fir/assumed-size-ops-roundtrip.fir | 13 +++++++ flang/test/HLFIR/assumed-type-actual-args.f90 | 2 +- flang/test/Lower/HLFIR/assumed-rank-iface.f90 | 2 +- flang/test/Lower/HLFIR/select-rank.f90 | 8 ++-- flang/test/Lower/Intrinsics/lbound.f90 | 2 +- flang/test/Lower/Intrinsics/ubound.f90 | 2 +- .../Lower/array-expression-assumed-size.f90 | 8 ++-- flang/test/Lower/entry-statement.f90 | 2 +- 16 files changed, 165 insertions(+), 18 deletions(-) create mode 100644 flang/test/Fir/assumed-size-ops-codegen.fir create mode 100644 flang/test/Fir/assumed-size-ops-folding.fir create mode 100644 flang/test/Fir/assumed-size-ops-roundtrip.fir diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index fc6eedc6ed4c6..86502c6271d23 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -1249,6 +1249,41 @@ def fir_IsAssumedSizeOp : fir_SimpleOp<"is_assumed_size", [NoMemoryEffect]> { let results = (outs BoolLike); } +def fir_AssumedSizeExtentOp : fir_SimpleOneResultOp<"assumed_size_extent", [NoMemoryEffect]> { + let summary = "get the assumed-size last extent sentinel"; + + let description = [{ + Returns the special extent value representing the last dimension of an + assumed-size array. This is used to model the semantics in FIR without + directly materializing the sentinel value. The concrete encoding is + introduced during FIR to LLVM lowering. + + ``` + %e = fir.assumed_size_extent : index + ``` + }]; + + let results = (outs Index); + let assemblyFormat = "attr-dict `:` type(results)"; +} + +def fir_IsAssumedSizeExtentOp : fir_SimpleOp<"is_assumed_size_extent", [NoMemoryEffect]> { + let summary = "is value the assumed-size last extent sentinel"; + + let description = [{ + Returns true iff the given integer equals the assumed-size extent sentinel. + + ``` + %t = fir.is_assumed_size_extent %v : (index) -> i1 + %c = fir.is_assumed_size_extent %x : (i32) -> i1 + ``` + }]; + + let arguments = (ins AnyIntegerLike:$val); + let results = (outs BoolLike); + let hasCanonicalizer = 1; +} + def fir_BoxIsPtrOp : fir_SimpleOp<"box_isptr", [NoMemoryEffect]> { let summary = "is the boxed value a POINTER?"; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 3b711ccbe786a..238a623dc6c5d 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4010,8 +4010,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { // parameters and dynamic type. The selector cannot be a // POINTER/ALLOCATBLE as per F'2023 C1160. 
fir::ExtendedValue newExv; - llvm::SmallVector assumeSizeExtents{ - builder->createMinusOneInteger(loc, builder->getIndexType())}; + llvm::SmallVector assumeSizeExtents{ + fir::AssumedSizeExtentOp::create(*builder, loc)}; mlir::Value baseAddr = hlfir::genVariableRawAddress(loc, *builder, selector); const bool isVolatile = fir::isa_volatile_type(selector.getType()); diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 00ec1b51e5400..2517ab35d4ff0 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1711,7 +1711,7 @@ static void lowerExplicitLowerBounds( /// CFI_desc_t requirements in 18.5.3 point 5.). static mlir::Value getAssumedSizeExtent(mlir::Location loc, fir::FirOpBuilder &builder) { - return builder.createMinusOneInteger(loc, builder.getIndexType()); + return fir::AssumedSizeExtentOp::create(builder, loc); } /// Lower explicit extents into \p result if this is an explicit-shape or diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 70bb43a2510ba..e71f4e3cee49c 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -749,6 +749,44 @@ struct VolatileCastOpConversion } }; +/// Lower `fir.assumed_size_extent` to constant -1 of index type. +struct AssumedSizeExtentOpConversion + : public fir::FIROpConversion { + using FIROpConversion::FIROpConversion; + + llvm::LogicalResult + matchAndRewrite(fir::AssumedSizeExtentOp op, OpAdaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Location loc = op.getLoc(); + mlir::Type ity = lowerTy().indexType(); + auto cst = fir::genConstantIndex(loc, ity, rewriter, -1); + rewriter.replaceOp(op, cst.getResult()); + return mlir::success(); + } +}; + +/// Lower `fir.is_assumed_size_extent` to integer equality with -1. +struct IsAssumedSizeExtentOpConversion + : public fir::FIROpConversion { + using FIROpConversion::FIROpConversion; + + llvm::LogicalResult + matchAndRewrite(fir::IsAssumedSizeExtentOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Location loc = op.getLoc(); + mlir::Value val = adaptor.getVal(); + mlir::Type valTy = val.getType(); + // Create constant -1 of the operand type. 
+ auto negOneAttr = rewriter.getIntegerAttr(valTy, -1); + auto negOne = + mlir::LLVM::ConstantOp::create(rewriter, loc, valTy, negOneAttr); + auto cmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, val, negOne); + rewriter.replaceOp(op, cmp.getResult()); + return mlir::success(); + } +}; + /// convert value of from-type to value of to-type struct ConvertOpConversion : public fir::FIROpConversion { using FIROpConversion::FIROpConversion; @@ -4360,6 +4398,7 @@ void fir::populateFIRToLLVMConversionPatterns( AllocaOpConversion, AllocMemOpConversion, BoxAddrOpConversion, BoxCharLenOpConversion, BoxDimsOpConversion, BoxEleSizeOpConversion, BoxIsAllocOpConversion, BoxIsArrayOpConversion, BoxIsPtrOpConversion, + AssumedSizeExtentOpConversion, IsAssumedSizeExtentOpConversion, BoxOffsetOpConversion, BoxProcHostOpConversion, BoxRankOpConversion, BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion, CmpcOpConversion, VolatileCastOpConversion, ConvertOpConversion, diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 1712af1d1eba7..d0164f32d9b6a 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -5142,6 +5142,34 @@ void fir::BoxTotalElementsOp::getCanonicalizationPatterns( patterns.add(context); } +//===----------------------------------------------------------------------===// +// IsAssumedSizeExtentOp and AssumedSizeExtentOp +//===----------------------------------------------------------------------===// + +namespace { +struct FoldIsAssumedSizeExtentOnCtor + : public mlir::OpRewritePattern { + using mlir::OpRewritePattern::OpRewritePattern; + mlir::LogicalResult + matchAndRewrite(fir::IsAssumedSizeExtentOp op, + mlir::PatternRewriter &rewriter) const override { + if (llvm::isa_and_nonnull( + op.getVal().getDefiningOp())) { + mlir::Type i1 = rewriter.getI1Type(); + rewriter.replaceOpWithNewOp( + op, i1, rewriter.getIntegerAttr(i1, 1)); + return mlir::success(); + } + return mlir::failure(); + } +}; +} // namespace + +void fir::IsAssumedSizeExtentOp::getCanonicalizationPatterns( + mlir::RewritePatternSet &patterns, mlir::MLIRContext *context) { + patterns.add(context); +} + //===----------------------------------------------------------------------===// // LocalitySpecifierOp //===----------------------------------------------------------------------===// diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp index ed9a2ae11f0d9..5bf783db92bf3 100644 --- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp +++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp @@ -832,8 +832,8 @@ static mlir::Type getEleTy(mlir::Type ty) { static bool isAssumedSize(llvm::SmallVectorImpl &extents) { if (extents.empty()) return false; - auto cstLen = fir::getIntIfConstant(extents.back()); - return cstLen.has_value() && *cstLen == -1; + return llvm::isa_and_nonnull( + extents.back().getDefiningOp()); } // Extract extents from the ShapeOp/ShapeShiftOp into the result vector. 
diff --git a/flang/test/Fir/assumed-size-ops-codegen.fir b/flang/test/Fir/assumed-size-ops-codegen.fir new file mode 100644 index 0000000000000..54e9b3c8f30f4 --- /dev/null +++ b/flang/test/Fir/assumed-size-ops-codegen.fir @@ -0,0 +1,19 @@ +// RUN: fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" %s | FileCheck %s + +// CHECK-LABEL: @assumed_size_extent( +// CHECK: %[[CNEG1:.*]] = llvm.mlir.constant(-1 : i64) +// CHECK: llvm.return %[[CNEG1]] : i64 +func.func @assumed_size_extent() -> index { + %e = fir.assumed_size_extent : index + return %e : index +} + +// CHECK-LABEL: @is_assumed_size_extent( +// CHECK: %[[NEG1:.*]] = llvm.mlir.constant(-1 : i64) +// CHECK: %[[CMP:.*]] = llvm.icmp "eq" +// CHECK: llvm.return %[[CMP]] : i1 +func.func @is_assumed_size_extent(%x: index) -> i1 { + %c = fir.is_assumed_size_extent %x : (index) -> i1 + return %c : i1 +} + diff --git a/flang/test/Fir/assumed-size-ops-folding.fir b/flang/test/Fir/assumed-size-ops-folding.fir new file mode 100644 index 0000000000000..9fd5fabb7290b --- /dev/null +++ b/flang/test/Fir/assumed-size-ops-folding.fir @@ -0,0 +1,13 @@ +// RUN: fir-opt --canonicalize %s | FileCheck %s + +// Verify: fir.is_assumed_size_extent(fir.assumed_size_extent) folds to i1 true. + +// CHECK-LABEL: func.func @fold( +func.func @fold() -> i1 { + %e = fir.assumed_size_extent : index + // CHECK: %[[C:.*]] = arith.constant true + %t = fir.is_assumed_size_extent %e : (index) -> i1 + return %t : i1 +} + + diff --git a/flang/test/Fir/assumed-size-ops-roundtrip.fir b/flang/test/Fir/assumed-size-ops-roundtrip.fir new file mode 100644 index 0000000000000..c3c1883a2b7c0 --- /dev/null +++ b/flang/test/Fir/assumed-size-ops-roundtrip.fir @@ -0,0 +1,13 @@ +// RUN: fir-opt %s | fir-opt | FileCheck %s + +func.func @roundtrip() { + // CHECK: %[[E:.*]] = fir.assumed_size_extent : index + %e = fir.assumed_size_extent : index + + // CHECK: %[[T:.*]] = fir.is_assumed_size_extent %[[E]] : (index) -> i1 + %t = fir.is_assumed_size_extent %e : (index) -> i1 + + return +} + + diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90 index 42e9ed27340e7..aaac98ba3c79d 100644 --- a/flang/test/HLFIR/assumed-type-actual-args.f90 +++ b/flang/test/HLFIR/assumed-type-actual-args.f90 @@ -113,7 +113,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_1:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) ! CHECK: fir.call @_QPs2(%[[VAL_3]]#1) fastmath : (!fir.ref>) -> () diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90 index 9ecbb7c9e5b94..ffb36fa4b7003 100644 --- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 +++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90 @@ -145,7 +145,7 @@ subroutine int_r2_assumed_size_to_assumed_rank(x) ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_3]] : index ! CHECK: %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index -! CHECK: %[[VAL_6:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_6:.*]] = fir.assumed_size_extent : index ! 
CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2> ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref>, !fir.shape<2>, !fir.dscope) -> (!fir.box>, !fir.ref>) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box>) -> !fir.box> diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90 index 0f80c723e468f..f1f968decd412 100644 --- a/flang/test/Lower/HLFIR/select-rank.f90 +++ b/flang/test/Lower/HLFIR/select-rank.f90 @@ -371,7 +371,7 @@ subroutine test_branching(x) ! CHECK: fir.call @_QPr1(%[[VAL_11]]#0) fastmath : (!fir.box>) -> () ! CHECK: cf.br ^bb6 ! CHECK: ^bb5: -! CHECK: %[[VAL_12:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_12:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box>) -> !fir.ref> ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> @@ -435,7 +435,7 @@ subroutine test_branching(x) ! CHECK: fir.call @_QPrdefault(%[[VAL_8]]#0) fastmath : (!fir.box>) -> () ! CHECK: cf.br ^bb5 ! CHECK: ^bb4: -! CHECK: %[[VAL_9:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_9:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box>) -> !fir.ref> ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1> @@ -482,7 +482,7 @@ subroutine test_branching(x) ! CHECK: fir.call @_QPr1_implicit(%[[VAL_21]]#1) fastmath : (!fir.ref>) -> () ! CHECK: cf.br ^bb6 ! CHECK: ^bb5: -! CHECK: %[[VAL_22:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_22:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_23:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box>) -> !fir.ref> ! CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_25:.*]] = fir.shape %[[VAL_22]] : (index) -> !fir.shape<1> @@ -534,7 +534,7 @@ subroutine test_branching(x) ! CHECK: fir.call @_QPrc1_implicit(%[[VAL_26]]) fastmath : (!fir.boxchar<1>) -> () ! CHECK: cf.br ^bb6 ! CHECK: ^bb5: -! CHECK: %[[VAL_27:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_27:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_28:.*]] = fir.box_addr %[[VAL_8]]#1 : (!fir.box>>) -> !fir.ref>> ! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.ref>>) -> !fir.ref>> ! CHECK: %[[VAL_30:.*]] = fir.shape %[[VAL_27]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/Intrinsics/lbound.f90 b/flang/test/Lower/Intrinsics/lbound.f90 index a5ca2d39a5ee4..75c11ff8bc95f 100644 --- a/flang/test/Lower/Intrinsics/lbound.f90 +++ b/flang/test/Lower/Intrinsics/lbound.f90 @@ -40,7 +40,7 @@ subroutine lbound_test_2(a, dim, res) subroutine lbound_test_3(a, dim, res) real, dimension(2:10, 3:*) :: a integer(8):: dim, res -! CHECK: %[[VAL_0:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_0:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_1:.*]] = fir.load %arg1 : !fir.ref ! CHECK: %[[VAL_2:.*]] = fir.shape_shift %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index, index) -> !fir.shapeshift<2> ! 
CHECK: %[[VAL_3:.*]] = fir.embox %arg0(%[[VAL_2]]) : (!fir.ref>, !fir.shapeshift<2>) -> !fir.box> diff --git a/flang/test/Lower/Intrinsics/ubound.f90 b/flang/test/Lower/Intrinsics/ubound.f90 index dae21acda170d..bc8cff8982542 100644 --- a/flang/test/Lower/Intrinsics/ubound.f90 +++ b/flang/test/Lower/Intrinsics/ubound.f90 @@ -48,7 +48,7 @@ subroutine ubound_test_2(a, dim, res) subroutine ubound_test_3(a, dim, res) real, dimension(10, 20, *) :: a integer(8):: dim, res -! CHECK: %[[VAL_0:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_0:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_1:.*]] = fir.shape %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index) -> !fir.shape<3> ! CHECK: %[[VAL_2:.*]] = fir.embox %{{.*}}(%[[VAL_1]]) : (!fir.ref>, !fir.shape<3>) -> !fir.box> ! CHECK: %[[VAL_3:.*]] = fir.load %{{.*}} : !fir.ref diff --git a/flang/test/Lower/array-expression-assumed-size.f90 b/flang/test/Lower/array-expression-assumed-size.f90 index 2fbf315aff114..a498148d07fc7 100644 --- a/flang/test/Lower/array-expression-assumed-size.f90 +++ b/flang/test/Lower/array-expression-assumed-size.f90 @@ -19,7 +19,7 @@ end subroutine assumed_size_forall_test ! CHECK: %[[VAL_1A:.*]] = fir.convert %c10{{.*}} : (i64) -> index ! CHECK: %[[VAL_1B:.*]] = arith.cmpi sgt, %[[VAL_1A]], %c0{{.*}} : index ! CHECK: %[[VAL_1:.*]] = arith.select %[[VAL_1B]], %[[VAL_1A]], %c0{{.*}} : index -! CHECK: %[[VAL_2:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_2:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index @@ -82,7 +82,7 @@ end subroutine assumed_size_forall_test ! CHECK: %[[VAL_2A:.*]] = fir.convert %c10{{.*}} : (i64) -> index ! CHECK: %[[VAL_2B:.*]] = arith.cmpi sgt, %[[VAL_2A]], %c0{{.*}} : index ! CHECK: %[[VAL_2:.*]] = arith.select %[[VAL_2B]], %[[VAL_2A]], %c0{{.*}} : index -! CHECK: %[[VAL_3:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_3:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> index ! CHECK: %[[VAL_6:.*]] = arith.constant 6 : i32 @@ -149,7 +149,7 @@ end subroutine assumed_size_forall_test ! PostOpt-DAG: %[[VAL_4:.*]] = arith.constant 0 : index ! PostOpt-DAG: %[[VAL_5:.*]] = arith.constant 3 : index ! PostOpt-DAG: %[[VAL_6:.*]] = arith.constant 4 : index -! PostOpt-DAG: %[[VAL_7:.*]] = arith.constant -1 : index +! PostOpt-DAG: %[[VAL_7:.*]] = fir.assumed_size_extent : index ! PostOpt: %[[VAL_8:.*]] = fir.shape %[[VAL_1]], %[[VAL_7]] : (index, index) -> !fir.shape<2> ! PostOpt: %[[VAL_9:.*]] = fir.slice %[[VAL_2]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]], %[[VAL_3]], %[[VAL_2]] : (index, index, index, index, index, index) -> !fir.slice<2> ! PostOpt: %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_3]] @@ -227,8 +227,8 @@ end subroutine assumed_size_forall_test ! PostOpt-DAG: %[[VAL_4:.*]] = arith.constant 1 : index ! PostOpt-DAG: %[[VAL_5:.*]] = arith.constant 0 : index ! PostOpt-DAG: %[[VAL_6:.*]] = arith.constant 5 : index -! PostOpt-DAG: %[[VAL_8:.*]] = arith.constant -1 : index ! PostOpt: %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} +! PostOpt: %[[VAL_8:.*]] = fir.assumed_size_extent : index ! PostOpt: %[[VAL_9:.*]] = fir.shape %[[VAL_2]], %[[VAL_8]] : (index, index) -> !fir.shape<2> ! PostOpt: %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_4]] ! 
PostOpt: br ^bb1(%[[VAL_5]], %[[VAL_4]] : index, index) diff --git a/flang/test/Lower/entry-statement.f90 b/flang/test/Lower/entry-statement.f90 index 83d2d324aed87..f1e535a2bd3a1 100644 --- a/flang/test/Lower/entry-statement.f90 +++ b/flang/test/Lower/entry-statement.f90 @@ -491,7 +491,7 @@ subroutine assumed_size() ! CHECK-LABEL: func.func @_QPentry_with_assumed_size( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_2:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFassumed_sizeEx"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) ! CHECK: cf.br ^bb1 From e83eee335c477ab80612b09bf840700d6982c3ef Mon Sep 17 00:00:00 2001 From: kper Date: Wed, 22 Oct 2025 11:49:34 +0200 Subject: [PATCH 072/530] [DAG] Create SDPatternMatch method `m_SelectLike` to match `ISD::Select` and `ISD::VSelect` (#164069) Fixes #150019 --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 5 +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 39 +++++++++---------- .../CodeGen/SelectionDAGPatternMatchTest.cpp | 25 ++++++++++++ 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 201dc68de8b76..0dcf400962393 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -558,6 +558,11 @@ m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F) { return TernaryOpc_match(ISD::VSELECT, Cond, T, F); } +template +inline auto m_SelectLike(const T0_P &Cond, const T1_P &T, const T2_P &F) { + return m_AnyOf(m_Select(Cond, T, F), m_VSelect(Cond, T, F)); +} + template inline Result_match<0, TernaryOpc_match> m_Load(const T0_P &Ch, const T1_P &Ptr, const T2_P &Offset) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 310d35d9b1d1e..6aa71254fe6ef 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2476,16 +2476,17 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, /// masked vector operation if the target supports it. static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG, bool ShouldCommuteOperands) { - // Match a select as operand 1. The identity constant that we are looking for - // is only valid as operand 1 of a non-commutative binop. SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + + // Match a select as operand 1. The identity constant that we are looking for + // is only valid as operand 1 of a non-commutative binop. if (ShouldCommuteOperands) std::swap(N0, N1); - unsigned SelOpcode = N1.getOpcode(); - if ((SelOpcode != ISD::VSELECT && SelOpcode != ISD::SELECT) || - !N1.hasOneUse()) + SDValue Cond, TVal, FVal; + if (!sd_match(N1, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(TVal), + m_Value(FVal))))) return SDValue(); // We can't hoist all instructions because of immediate UB (not speculatable). 
@@ -2493,11 +2494,9 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG, if (!DAG.isSafeToSpeculativelyExecuteNode(N)) return SDValue(); + unsigned SelOpcode = N1.getOpcode(); unsigned Opcode = N->getOpcode(); EVT VT = N->getValueType(0); - SDValue Cond = N1.getOperand(0); - SDValue TVal = N1.getOperand(1); - SDValue FVal = N1.getOperand(2); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // This transform increases uses of N0, so freeze it to be safe. @@ -13856,12 +13855,11 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, Opcode == ISD::ANY_EXTEND) && "Expected EXTEND dag node in input!"); - if (!(N0->getOpcode() == ISD::SELECT || N0->getOpcode() == ISD::VSELECT) || - !N0.hasOneUse()) + SDValue Cond, Op1, Op2; + if (!sd_match(N0, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(Op1), + m_Value(Op2))))) return SDValue(); - SDValue Op1 = N0->getOperand(1); - SDValue Op2 = N0->getOperand(2); if (!isCompatibleLoad(Op1, Opcode) || !isCompatibleLoad(Op2, Opcode)) return SDValue(); @@ -13883,7 +13881,7 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1); SDValue Ext2 = DAG.getNode(Opcode, DL, VT, Op2); - return DAG.getSelect(DL, VT, N0->getOperand(0), Ext1, Ext2); + return DAG.getSelect(DL, VT, Cond, Ext1, Ext2); } /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or @@ -29620,13 +29618,14 @@ static SDValue takeInexpensiveLog2(SelectionDAG &DAG, const SDLoc &DL, EVT VT, } // c ? X : Y -> c ? Log2(X) : Log2(Y) - if ((Op.getOpcode() == ISD::SELECT || Op.getOpcode() == ISD::VSELECT) && - Op.hasOneUse()) { - if (SDValue LogX = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(1), - Depth + 1, AssumeNonZero)) - if (SDValue LogY = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(2), - Depth + 1, AssumeNonZero)) - return DAG.getSelect(DL, VT, Op.getOperand(0), LogX, LogY); + SDValue Cond, TVal, FVal; + if (sd_match(Op, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(TVal), + m_Value(FVal))))) { + if (SDValue LogX = + takeInexpensiveLog2(DAG, DL, VT, TVal, Depth + 1, AssumeNonZero)) + if (SDValue LogY = + takeInexpensiveLog2(DAG, DL, VT, FVal, Depth + 1, AssumeNonZero)) + return DAG.getSelect(DL, VT, Cond, LogX, LogY); } // log2(umin(X, Y)) -> umin(log2(X), log2(Y)) diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index 16b997901dc1c..aa56aafa2812c 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -550,6 +550,31 @@ TEST_F(SelectionDAGPatternMatchTest, matchNode) { EXPECT_FALSE(sd_match(Add, m_Node(ISD::ADD, m_ConstInt(), m_Value()))); } +TEST_F(SelectionDAGPatternMatchTest, matchSelectLike) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4); + + SDValue Cond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 0, Int32VT); + SDValue TVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue FVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT); + + SDValue VCond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 0, VInt32VT); + SDValue VTVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, VInt32VT); + SDValue VFVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT); + + SDValue Select = DAG->getNode(ISD::SELECT, DL, Int32VT, Cond, TVal, FVal); + SDValue VSelect = + DAG->getNode(ISD::VSELECT, DL, Int32VT, 
VCond, VTVal, VFVal); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match(Select, m_SelectLike(m_Specific(Cond), m_Specific(TVal), + m_Specific(FVal)))); + EXPECT_TRUE( + sd_match(VSelect, m_SelectLike(m_Specific(VCond), m_Specific(VTVal), + m_Specific(VFVal)))); +} + namespace { struct VPMatchContext : public SDPatternMatch::BasicMatchContext { using SDPatternMatch::BasicMatchContext::BasicMatchContext; From 1fbfac30f107cbf63f91101fa5b34dee397089af Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Wed, 22 Oct 2025 03:08:24 -0700 Subject: [PATCH 073/530] [WebAssembly] [Codegen] Add pattern for relaxed min max from fminimum/fmaximum over v4f32 and v2f64 (#162948) Related to #55932 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 5 ++ .../WebAssembly/WebAssemblyInstrSIMD.td | 17 ++++++ .../CodeGen/WebAssembly/simd-relaxed-fmax.ll | 60 +++++++++++++++++++ .../CodeGen/WebAssembly/simd-relaxed-fmin.ll | 59 ++++++++++++++++++ 4 files changed, 141 insertions(+) create mode 100644 llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll create mode 100644 llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f9739492e7fe3..7ec463bdc3b84 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -183,6 +183,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( for (auto T : {MVT::i32, MVT::i64}) setOperationAction(Op, T, Custom); + if (Subtarget->hasRelaxedSIMD()) { + setOperationAction( + {ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXNUM, ISD::FMAXIMUMNUM}, + {MVT::v4f32, MVT::v2f64}, Legal); + } // SIMD-specific configuration if (Subtarget->hasSIMD128()) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 784062066ed64..f0ac26b8edec8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1742,6 +1742,23 @@ defm SIMD_RELAXED_FMIN : defm SIMD_RELAXED_FMAX : RelaxedBinary; +let Predicates = [HasRelaxedSIMD] in { + foreach vec = [F32x4, F64x2] in { + defvar relaxed_min = !cast("SIMD_RELAXED_FMIN_"#vec); + defvar relaxed_max = !cast("SIMD_RELAXED_FMAX_"#vec); + + // Transform standard fminimum/fmaximum to relaxed versions + def : Pat<(vec.vt (fminnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))), + (relaxed_min V128:$lhs, V128:$rhs)>; + def : Pat<(vec.vt (fminimumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))), + (relaxed_min V128:$lhs, V128:$rhs)>; + def : Pat<(vec.vt (fmaxnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))), + (relaxed_max V128:$lhs, V128:$rhs)>; + def : Pat<(vec.vt (fmaximumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))), + (relaxed_max V128:$lhs, V128:$rhs)>; + } +} + //===----------------------------------------------------------------------===// // Relaxed rounding q15 multiplication //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll new file mode 100644 index 0000000000000..45f4ddd783a55 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 + +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s + +; Test that fmaxnum and 
fmaximumnum get transformed to relaxed_max + +target triple = "wasm32" + +define <4 x float> @test_maxnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_maxnum_f32x4: +; CHECK: .functype test_maxnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <4 x float> @test_maximumnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_maximumnum_f32x4: +; CHECK: .functype test_maximumnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x double> @test_maxnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_maxnum_f64x2: +; CHECK: .functype test_maxnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minimumnum_f64x2: +; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_max +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.maximumnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll new file mode 100644 index 0000000000000..f3eec023663a7 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s + +; Test that fminnum and fminimumnum get transformed to relaxed_min + +target triple = "wasm32" + +define <4 x float> @test_minnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_minnum_f32x4: +; CHECK: .functype test_minnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <4 x float> @test_minimumnum_f32x4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_minimumnum_f32x4: +; CHECK: .functype test_minimumnum_f32x4 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f32x4.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x double> 
@test_minnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minnum_f64x2: +; CHECK: .functype test_minnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_minimumnum_f64x2: +; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: f64x2.relaxed_min +; CHECK-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.fminimumnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.fminimumnum.v2f64(<2 x double>, <2 x double>) From 0d20f3fa1fee43bc50883fbc988171cc0eb5a8e3 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 22 Oct 2025 10:19:08 +0000 Subject: [PATCH 074/530] [lldb][test] Fix address type in ReadMemoryRanges test Tests added by #163651. Use lldb::addr_t (which is always 64-bit) for all addresses so that we don't calculate an invalid address on 32-bit and segfault. As happened on Linaro's Arm 32-bit buildbot. --- lldb/unittests/Target/MemoryTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/unittests/Target/MemoryTest.cpp b/lldb/unittests/Target/MemoryTest.cpp index f7b4e97b1f64a..e444f68dc4871 100644 --- a/lldb/unittests/Target/MemoryTest.cpp +++ b/lldb/unittests/Target/MemoryTest.cpp @@ -245,7 +245,7 @@ class DummyReaderProcess : public Process { if (read_more_than_requested) size *= 2; uint8_t *buffer = static_cast(buf); - for (size_t addr = vm_addr; addr < vm_addr + size; addr++) + for (lldb::addr_t addr = vm_addr; addr < vm_addr + size; addr++) buffer[addr - vm_addr] = static_cast(addr); // LSB of addr. return size; } From 4f020c4fb3b014e00caeee14a48f1063390977f8 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 18:24:43 +0800 Subject: [PATCH 075/530] [doc] Remove unsafe-fp-math references (#164579) Stop mentioning `unsafe-fp-math` related things in documents. --- llvm/docs/CommandGuide/llc.rst | 7 ------- llvm/docs/CommandGuide/lli.rst | 5 ----- llvm/docs/SourceLevelDebugging.rst | 2 +- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/llvm/docs/CommandGuide/llc.rst b/llvm/docs/CommandGuide/llc.rst index 900649f59c401..cc670f6043656 100644 --- a/llvm/docs/CommandGuide/llc.rst +++ b/llvm/docs/CommandGuide/llc.rst @@ -125,13 +125,6 @@ End-user Options Enable setting the FP exceptions build attribute not to use exceptions. -.. option:: --enable-unsafe-fp-math - - Enable optimizations that make unsafe assumptions about IEEE math (e.g. that - addition is associative) or may not work for all input ranges. These - optimizations allow the code generator to make use of some instructions which - would otherwise not be usable (such as ``fsin`` on X86). - .. option:: --stats Print statistics recorded by code-generation passes. 
diff --git a/llvm/docs/CommandGuide/lli.rst b/llvm/docs/CommandGuide/lli.rst index 94c001380441f..8afe10db05d48 100644 --- a/llvm/docs/CommandGuide/lli.rst +++ b/llvm/docs/CommandGuide/lli.rst @@ -107,11 +107,6 @@ FLOATING POINT OPTIONS Enable optimizations that assume no NAN values. -.. option:: -enable-unsafe-fp-math - - Causes :program:`lli` to enable optimizations that may decrease floating point - precision. - .. option:: -soft-float Causes :program:`lli` to generate software floating point library calls instead of diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst index f057b2d2d2e36..12b5e3e549df7 100644 --- a/llvm/docs/SourceLevelDebugging.rst +++ b/llvm/docs/SourceLevelDebugging.rst @@ -674,7 +674,7 @@ Compiled to LLVM, this function would be represented like this: ret void, !dbg !24 } - attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind readnone } !llvm.dbg.cu = !{!0} From b7a93b1f7ffc7591d25984f9a940edf1289b971c Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Wed, 22 Oct 2025 11:40:11 +0100 Subject: [PATCH 076/530] [Utils][update_mc_test_checks] Generate check lines in alphabetical order. (#164424) Currently, check lines are generated beginning from those with the most used prefixes, and then if two or more prefixes cover the same number of RUN lines, they would come in the order of those RUN lines. This means adding RUN lines may change the order of check lines, even if the set of check prefixes remained the same, which is not ideal. 
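For example, in the regenerated llvm/test/MC/AMDGPU/literals.s below, the checks for `v_trunc_f32 v0, 0.5` were previously emitted as (encodings elided):

  // SICI: v_trunc_f32_e32 v0, 0.5 ...
  // GFX89: v_trunc_f32_e32 v0, 0.5 ...
  // GFX12XX: v_trunc_f32_e32 v0, 0.5 ...
  // GFX11: v_trunc_f32_e32 v0, 0.5 ...

and are now emitted alphabetically by prefix:

  // GFX11: v_trunc_f32_e32 v0, 0.5 ...
  // GFX12XX: v_trunc_f32_e32 v0, 0.5 ...
  // GFX89: v_trunc_f32_e32 v0, 0.5 ...
  // SICI: v_trunc_f32_e32 v0, 0.5 ...

With this, adding a new RUN line no longer reshuffles the existing check lines as long as the set of prefixes they use stays the same.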
--- llvm/test/MC/AMDGPU/literals.s | 1624 +++++++++++++-------------- llvm/utils/update_mc_test_checks.py | 28 +- 2 files changed, 828 insertions(+), 824 deletions(-) diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index 78aa8f2b44251..3faea9921bb23 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -20,282 +20,282 @@ //---------------------------------------------------------------------------// v_fract_f64 v[0:1], 0.5 -// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e] v_sqrt_f64 v[0:1], -4.0 -// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] -// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e] -// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] // GFX11: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] +// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] +// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e] +// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e] v_log_clamp_f32 v1, 0.5 // NOGFX8PLUS: :[[@LINE-1]]:1: error: instruction not supported on this GPU // SICI: v_log_clamp_f32_e32 v1, 0.5 ; encoding: [0xf0,0x4c,0x02,0x7e] v_trunc_f32 v0, 0.5 -// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e] v_fract_f64 v[0:1], -1.0 -// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e] v_trunc_f32 v0, -1.0 -// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e] v_fract_f64 v[0:1], 4.0 -// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 4.0 ; 
encoding: [0xf6,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e] v_trunc_f32 v0, 4.0 -// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e] v_fract_f64 v[0:1], 0.0 -// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e] v_trunc_f32 v0, 0.0 -// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] v_fract_f64 v[0:1], 1.5 -// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] // GFX11: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f] +// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f] v_trunc_f32 v0, 1.5 -// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] -// GFX89: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f] -// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] // GFX11: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// GFX89: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f] +// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f] v_fract_f64 v[0:1], -3.1415 -// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] -// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // GFX12: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // GFX1250: v_fract_f64_e32 v[0:1], 0xc00921cac083126f ; encoding: [0xfe,0x7c,0x00,0x7e,0x6f,0x12,0x83,0xc0,0xca,0x21,0x09,0xc0] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, -3.1415 -// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] -// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0] -// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] // GFX11: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0] +// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0] v_fract_f64 v[0:1], 100000000000000000000000.0 -// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] -// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // GFX12: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // GFX1250: v_fract_f64_e32 v[0:1], 0x44b52d02c7e14af6 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf6,0x4a,0xe1,0xc7,0x02,0x2d,0xb5,0x44] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 100000000000000000000000.0 -// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] -// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65] -// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] // GFX11: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] +// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] +// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65] +// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65] v_fract_f64 v[0:1], 10000000.0 -// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] -// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] // GFX11: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] +// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41] +// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41] v_trunc_f32 v0, 10000000.0 -// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] -// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b] -// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] // GFX11: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] +// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] +// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b] +// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b] v_fract_f64 v[0:1], 3.402823e+38 -// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] -// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // GFX12: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // GFX1250: v_fract_f64_e32 v[0:1], 0x47efffff966ad924 ; encoding: [0xfe,0x7c,0x00,0x7e,0x24,0xd9,0x6a,0x96,0xff,0xff,0xef,0x47] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 3.402823e+38 -// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] -// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f] -// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] // GFX11: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f] +// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f] v_fract_f64 v[0:1], 2.3509886e-38 -// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] -// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // GFX12: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // GFX1250: v_fract_f64_e32 v[0:1], 0x381fffffe8c9d9fb ; encoding: [0xfe,0x7c,0x00,0x7e,0xfb,0xd9,0xc9,0xe8,0xff,0xff,0x1f,0x38] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 2.3509886e-38 -// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] -// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00] -// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] // GFX11: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] +// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] +// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00] +// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00] v_fract_f64 v[0:1], 2.3509886e-70 -// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] -// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31] -// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero // GFX11: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // GFX12: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // GFX1250: v_fract_f64_e32 v[0:1], 0x3179f623c2d3cf3c ; encoding: [0xfe,0x7c,0x00,0x7e,0x3c,0xcf,0xd3,0xc2,0x23,0xf6,0x79,0x31] -// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31] +// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero +// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. 
Low 32-bits will be set to zero v_trunc_f32 v0, 2.3509886e-70 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 1.0 -// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] v_fract_f64_e32 v[0:1], lit(1.0) -// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX11: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX12: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX1250: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xfe,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f,0x00,0x00,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f] +// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction -// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e] // GFX1250: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this 
GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00] // GFX1250: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, 1.0 ; encoding: [0xf2,0x94,0x0a,0x7e] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, lit(0x3f80) ; encoding: [0xff,0x94,0x0a,0x7e,0x80,0x3f,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_trunc_f32_e32 v0, 1.0 -// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e] v_trunc_f32_e32 v0, lit(1.0) -// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] -// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f] -// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] // GFX11: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] +// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f] +// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f] v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: 
error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_bf16_bf16 v5.l, v1, v2, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x3f80) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x80,0x3f,0x00,0x00] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, 1.0, v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c] // GFX12: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, lit(1.0), v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00] // GFX12: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, 0x3c00 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, lit(1.0) -// NOSICI: 
:[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x3c00) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // fp literal, expected int operand @@ -309,118 +309,118 @@ s_mov_b64 s[0:1], lit(0.5) // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0.5, v1 -// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36] v_and_b32_e64 v0, 0.5, v1 -// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], -1.0 // GFX8PLUS: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x04,0x80,0xbe] v_and_b32_e32 v0, -1.0, v1 -// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36] v_and_b32_e64 v0, -1.0, v1 -// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 4.0 // GFX8PLUS: s_mov_b64 s[0:1], 4.0 ; 
encoding: [0xf6,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 4.0 ; encoding: [0xf6,0x04,0x80,0xbe] v_and_b32_e32 v0, 4.0, v1 -// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36] v_and_b32_e64 v0, 4.0, v1 -// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 0.0 // GFX8PLUS: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe] v_and_b32_e32 v0, 0.0, v1 -// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] v_and_b32_e64 v0, 0.0, v1 -// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 1.5 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 1.5, v1 -// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] -// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f] -// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] // GFX11: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] +// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] +// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f] +// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f] s_mov_b64_e32 s[0:1], -3.1415 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, -3.1415, v1 -// SICI: 
v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] -// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0] -// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] // GFX11: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] +// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] +// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0] +// SICI: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0] s_mov_b64_e32 s[0:1], 100000000000000000000000.0 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 100000000000000000000000.0, v1 -// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] -// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65] -// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] // GFX11: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] +// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] +// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65] +// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65] s_mov_b64_e32 s[0:1], 10000000.0 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 10000000.0, v1 -// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] -// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b] -// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] // GFX11: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] +// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] +// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b] +// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b] s_mov_b64_e32 s[0:1], 3.402823e+38 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 3.402823e+38, v1 -// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] -// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f] -// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] // GFX11: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] +// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] +// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f] +// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f] s_mov_b64_e32 s[0:1], 2.3509886e-38 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 2.3509886e-38, v1 -// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] -// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00] -// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: 
[0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] // GFX11: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] +// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] +// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00] +// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00] s_mov_b64_e32 s[0:1], 2.3509886e-70 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction @@ -429,322 +429,322 @@ v_and_b32_e32 v0, 2.3509886e-70, v1 // NOGCN: :[[@LINE-1]]:19: error: invalid operand for instruction v_not_b16 v5.l, 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e] // GFX1250: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_not_b16 v5.l, lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f] // GFX1250: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_and_b32_e32 v0, 1.0, v1 -// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36] v_and_b32_e32 v0, lit(1.0), v1 -// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] -// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f] -// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] // GFX11: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] +// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] +// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f] +// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f] v_pk_add_u16 v5, exec_lo, 1.0 +// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] // GFX12XX: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_u16 v5, exec_lo, 
1.0 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0xe4,0x01,0x18] -// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_pk_add_u16 v5, exec_lo, lit(1.0) -// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f] +// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xca,0x03] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1.0) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x3f800000) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x00,0x00,0x80,0x3f] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // int literal, expected fp operand //---------------------------------------------------------------------------// v_trunc_f32_e32 v0, 0 -// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 1 -// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e] 
-// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e] v_fract_f64_e32 v[0:1], lit(1) -// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX11: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX12: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xfe,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00] +// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] v_trunc_f32_e64 v0, 0 -// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], 0 -// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00] v_trunc_f32_e32 v0, -13 -// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], -13 -// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e] v_trunc_f32_e64 v0, -13 -// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00] -// 
GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], -13 -// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00] v_trunc_f32_e32 v0, 35 -// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 35 -// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e] v_trunc_f32_e64 v0, 35 -// SICI: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] // GFX11: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00] +// SICI: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00] v_fract_f64_e64 v[0:1], 35 -// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00] -// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] // GFX11: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00] +// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: 
[0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00] v_trunc_f32_e32 v0, 1234 -// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] // GFX11: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00] +// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00] v_fract_f64_e32 v[0:1], 1234 -// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] // GFX11: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] +// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00] +// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00] v_trunc_f32_e64 v0, 1234 +// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:21: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:21: error: literal operands are not supported -// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:21: error: literal operands are not supported v_fract_f64_e64 v[0:1], 1234 +// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported -// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported v_trunc_f32_e32 v0, -54321 -// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] // GFX11: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff] v_fract_f64_e32 v[0:1], -54321 -// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX89: 
v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] // GFX11: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff] +// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff] v_trunc_f32_e32 v0, 0xdeadbeef -// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] // GFX11: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde] +// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde] v_fract_f64_e32 v[0:1], 0xdeadbeef -// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] // GFX11: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] +// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde] +// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde] v_trunc_f32_e32 v0, 0xffffffff -// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 0xffffffff -// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] -// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff] -// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] // GFX11: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] +// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] +// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff] +// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff] v_trunc_f32_e32 v0, 0x123456789abcdef0 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 0x123456789abcdef0 -// NOSICI: :[[@LINE-1]]:25: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:25: error: invalid operand for 
instruction // GFX1250: v_fract_f64_e32 v[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12] -// NOGFX11: :[[@LINE-4]]:25: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:25: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:25: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:25: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:25: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:25: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:25: error: invalid operand for instruction v_trunc_f32_e32 v0, 0xffffffffffffffff -// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e] v_fract_f64_e32 v[0:1], 0xffffffffffffffff -// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] -// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] // GFX11: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e] v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction -// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e] // GFX1250: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e] -// 
NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cos_f16_e32 v5.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode // GFX11: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, 1 ; encoding: [0x81,0x94,0x0a,0x7e] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_tanh_bf16 v5, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_tanh_bf16_e32 v5, lit(0x1) ; encoding: [0xff,0x94,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_trunc_f32_e32 v0, 1 -// SICI: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] -// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e] v_trunc_f32_e32 v0, lit(1) -// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX11: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00] +// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00] 
v_dot2_bf16_bf16 v5.l, v1, v2, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x06,0x02] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_bf16_bf16 v5.l, v1, v2, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x1) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, 1, v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c] // GFX12: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_dot2_f32_f16 v5, v1, lit(1), v2 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00] // GFX12: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00] -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, 1 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: 
:[[@LINE-5]]:1: error: instruction not supported on this GPU v_cvt_pk_fp8_f16 v1.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x1) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // int literal, expected int operand @@ -755,111 +755,111 @@ s_mov_b64_e32 s[0:1], 0 // SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe] v_and_b32_e32 v0, 0, v1 -// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36] v_and_b32_e64 v0, 0, v1 -// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], -13 // GFX8PLUS: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x04,0x80,0xbe] v_and_b32_e32 v0, -13, v1 -// SICI: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36] v_and_b32_e64 v0, -13, v1 -// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 
35 // GFX8PLUS: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x04,0x80,0xbe] v_and_b32_e32 v0, 35, v1 -// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36] v_and_b32_e64 v0, 35, v1 -// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00] -// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] // GFX11: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00] +// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00] s_mov_b64_e32 s[0:1], 1234 // GFX8PLUS: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x01,0x80,0xbe,0xd2,0x04,0x00,0x00] // SICI: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x04,0x80,0xbe,0xd2,0x04,0x00,0x00] v_and_b32_e32 v0, 1234, v1 -// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] -// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00] -// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] // GFX11: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] +// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] +// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00] +// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00] v_and_b32_e64 v0, 1234, v1 +// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] // GFX12XX: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] -// NOSICI: :[[@LINE-2]]:19: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:19: error: literal operands are not supported -// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00] +// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:19: error: literal operands are not supported s_mov_b64_e32 s[0:1], -54321 -// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff] -// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX11: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX12: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] // GFX1250: s_mov_b64 s[0:1], 0xffffffffffff2bcf ; encoding: [0xfe,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff,0xff,0xff,0xff,0xff] +// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff] +// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff] 
v_and_b32_e32 v0, -54321, v1 -// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] -// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff] -// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] // GFX11: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] +// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] +// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff] +// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff] s_mov_b64_e32 s[0:1], 0xdeadbeef -// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde] -// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX11: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX12: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] // GFX1250: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xfe,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde] +// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde] v_and_b32_e32 v0, 0xdeadbeef, v1 -// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] -// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde] -// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] // GFX11: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] +// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] +// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde] +// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde] s_mov_b64_e32 s[0:1], 0xffffffff -// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff] -// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX11: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX12: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] // GFX1250: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff] +// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff] v_and_b32_e32 v0, 0xffffffff, v1 -// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] s_mov_b64_e32 s[0:1], 0x123456789abcdef0 -// NOSICI: :[[@LINE-1]]:23: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:23: error: invalid operand for instruction 
// GFX1250: s_mov_b64 s[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x01,0x80,0xbe,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12] -// NOGFX11: :[[@LINE-4]]:23: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:23: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:23: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:23: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:23: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:23: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0x123456789abcdef0, v1 @@ -870,75 +870,75 @@ s_mov_b64_e32 s[0:1], 0xffffffffffffffff // SICI: s_mov_b64 s[0:1], -1 ; encoding: [0xc1,0x04,0x80,0xbe] v_and_b32_e32 v0, 0xffffffffffffffff, v1 -// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36] v_not_b16 v5.l, 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e] // GFX1250: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_not_b16 v5.l, lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00] // GFX1250: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00] -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU s_mov_b64 s[0:1], 1 // GFX8PLUS: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x01,0x80,0xbe] // SICI: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x04,0x80,0xbe] s_mov_b64 s[0:1], lit(1) -// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00] -// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] // GFX11: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] // GFX12: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] // GFX1250: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] +// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00] v_and_b32_e32 v0, 1, v1 -// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] -// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: 
[0x81,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36] v_and_b32_e32 v0, lit(1), v1 -// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] -// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00] -// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] // GFX11: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] +// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] +// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00] +// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00] v_pk_add_u16 v5, exec_lo, 1 +// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18] // GFX12XX: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0x02,0x01,0x18] -// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_pk_add_u16 v5, exec_lo, lit(1) -// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00] +// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x06,0x02] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1) -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x1) ; encoding: 
[0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x01,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU //---------------------------------------------------------------------------// // 1/(2*PI) @@ -948,46 +948,46 @@ v_trunc_f32_e32 v0, 0x3fc45f306dc9c882 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction v_fract_f64_e32 v[0:1], 0x3fc45f306dc9c882 -// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] -// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction // GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] +// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction v_trunc_f32_e32 v0, 0x3e22f983 -// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] v_fract_f64_e32 v[0:1], 0x3e22f983 -// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] // GFX11: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e] v_trunc_f32_e64 v0, 0x3fc45f306dc9c882 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction v_fract_f64_e64 v[0:1], 0x3fc45f306dc9c882 -// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00] -// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00] -// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction // GFX11: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00] +// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00] +// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:25: 
error: invalid operand for instruction // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction v_trunc_f32_e64 v0, 0x3e22f983 -// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00] -// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00] -// NOSICI: :[[@LINE-3]]:21: error: literal operands are not supported // GFX11: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00] +// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00] +// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported // NOSICIVI: :[[@LINE-2]]:21: error: literal operands are not supported v_fract_f64_e64 v[0:1], 0x3e22f983 +// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e] // GFX12XX: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e] -// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported -// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e] +// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335 @@ -996,37 +996,37 @@ s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335 // NOSICIVI: :[[@LINE-2]]:23: error: invalid operand for instruction v_and_b32_e32 v0, 0.159154943091895317852646485335, v1 -// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e] -// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26] -// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36] // GFX11: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36] +// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36] +// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26] +// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e] v_and_b32_e64 v0, 0.159154943091895317852646485335, v1 -// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00] -// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00] -// NOSICI: :[[@LINE-3]]:19: error: literal operands are not supported // GFX11: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00] +// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00] +// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00] +// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported // NOSICIVI: :[[@LINE-2]]:19: error: literal operands are not supported v_fract_f64 v[0:1], 0.159154943091895317852646485335 -// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f] -// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] +// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] // GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// GFX89: 
v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e] // NOSICI: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero -// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e] +// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f] // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero v_trunc_f32 v0, 0.159154943091895317852646485335 -// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] -// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] // GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e] +// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e] +// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] v_trunc_f32 v0, lit(0.159154943091895317852646485335) -// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e] -// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] // GFX11: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e] +// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e] //---------------------------------------------------------------------------// // integer literal truncation checks @@ -1051,54 +1051,54 @@ v_trunc_f32 v0, 0x1fffffff000 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction s_mov_b64 s[0:1], 0x101ffffffff -// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x101ffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x01,0x01,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction s_mov_b64 s[0:1], 0x1000000001 -// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x1000000001 ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x10,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction +// 
NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction s_mov_b64 s[0:1], 0x1000000fff -// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction -// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction // GFX1250: s_mov_b64 s[0:1], 0x1000000fff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0x0f,0x00,0x00,0x10,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction +// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction +// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction v_trunc_f64 v[0:1], 0x1fffffffff0 -// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffffff0 ; encoding: [0xfe,0x2e,0x00,0x7e,0xf0,0xff,0xff,0xff,0xff,0x01,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction -// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction -// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction +// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction +// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction +// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction +// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction v_trunc_f64 v[0:1], 0x100000001 -// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction // GFX1250: v_trunc_f64_e32 v[0:1], 0x100000001 ; encoding: [0xfe,0x2e,0x00,0x7e,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction -// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction -// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction +// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction +// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction +// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction +// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction v_trunc_f64 v[0:1], 0x1fffffff000 -// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffff000 ; encoding: [0xfe,0x2e,0x00,0x7e,0x00,0xf0,0xff,0xff,0xff,0x01,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction -// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction -// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction +// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction +// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction +// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction +// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not 
supported on this GPU // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction //---------------------------------------------------------------------------// @@ -1106,210 +1106,210 @@ v_trunc_f64 v[0:1], 0x1fffffff000 //---------------------------------------------------------------------------// buffer_atomic_add v0, off, s[0:3], scc offset:4095 -// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd] -// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd] -// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00] // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xfd] +// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00] +// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd] +// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd] s_add_i32 s0, vccz, s0 -// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81] // GFX89: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81] -// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU +// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81] s_add_i32 s0, execz, s0 -// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81] // GFX89: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81] -// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU +// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81] s_add_i32 s0, scc, s0 -// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] -// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] -// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] // GFX11: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] +// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] +// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] +// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81] s_and_b64 s[0:1], s[0:1], src_vccz -// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87] // GFX89: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x86] -// NOGFX11: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:27: error: 
src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:27: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:27: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU +// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87] s_and_b64 s[0:1], s[0:1], src_execz -// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87] // GFX89: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x86] -// NOGFX11: :[[@LINE-3]]:27: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:27: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:27: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:27: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:27: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:27: error: src_execz register not available on this GPU +// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87] s_and_b64 s[0:1], s[0:1], src_scc -// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87] -// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86] -// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b] // GFX11: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b] +// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b] +// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86] +// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87] v_add_u16 v0, vccz, v0 // GFX89: v_add_u16_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x4c] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, src_scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xfd,0x06,0x86,0x06] -// NOVI: :[[@LINE-3]]:20: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:20: error: invalid operand for 
instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, v0, scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, v0, src_scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xfa,0x01,0x4c,0x00,0x06,0x06,0x86] -// NOVI: :[[@LINE-3]]:24: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:24: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32 v0, execz, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x68] -// NOVI: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode -// NOGFX11: :[[@LINE-4]]:15: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:15: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:15: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32_e64 v0, scc, v0 +// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00] // GFX12XX: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xfd,0x00,0x02,0x00] -// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_cmp_eq_i64 vcc, scc, v[0:1] -// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0x44,0x7d] // GFX89: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0xc4,0x7d] -// NOGFX11: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode -// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode +// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; 
encoding: [0xfd,0x00,0x44,0x7d] v_max_f16 v0, execz, v0 // GFX89: v_max_f16_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x5a] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_max_f32 v0, vccz, v0 -// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20] // GFX89: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x16] -// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU +// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20] v_max_f64 v[0:1], scc, v[0:1] -// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00] -// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00] -// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c] // GFX11: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xfd,0x00,0x02,0x00] +// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c] +// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00] +// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00] v_pk_add_f16 v0, execz, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_f16 v0, src_execz, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xfc,0x00,0x02,0x18] -// NOVI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-4]]:18: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:18: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:18: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:18: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:18: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:18: error: src_execz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, neg(vccz) // GFX89: v_ceil_f16_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x85,0xd1,0xfb,0x00,0x00,0x20] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU -// NOGFX12: 
:[[@LINE-4]]:20: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, abs(scc) -// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00] -// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00] -// NOSICI: :[[@LINE-3]]:1: error: instruction not supported on this GPU // GFX11: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00] +// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00] +// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_ceil_f64 v[5:6], |execz| -// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00] // CI: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x30,0xd3,0xfc,0x00,0x00,0x00] -// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX11: :[[@LINE-4]]:21: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:21: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:21: error: src_execz register not available on this GPU +// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00] +// NOGFX11: :[[@LINE-3]]:21: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-4]]:21: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-5]]:21: error: src_execz register not available on this GPU +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU v_ceil_f64 v[5:6], -vcc -// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20] // CI: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x30,0xd3,0x6a,0x00,0x00,0x20] // GFX11: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20] // GFX12: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20] -// NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:12: error: invalid operand for instruction +// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20] +// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU v_ceil_f32 v0, -vccz -// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20] // GFX89: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x5d,0xd1,0xfb,0x00,0x00,0x20] -// NOGFX11: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:17: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:17: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:17: 
error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU +// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20] v_ceil_f32 v0, |execz| -// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00] // GFX89: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x5d,0xd1,0xfc,0x00,0x00,0x00] -// NOGFX11: :[[@LINE-3]]:17: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:17: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:17: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-2]]:17: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-3]]:17: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:17: error: src_execz register not available on this GPU +// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00] v_ceil_f16_sdwa v5, |vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, |src_vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfb,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16_sdwa v5, -scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, -src_scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfd,0x16,0x96,0x00] -// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f32_sdwa v5, vccz dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, src_vccz dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfb,0x16,0x86,0x00] -// NOVI: :[[@LINE-3]]:21: error: invalid operand for 
instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:21: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, |src_execz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfc,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported //---------------------------------------------------------------------------// @@ -1317,266 +1317,266 @@ v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD //---------------------------------------------------------------------------// buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 -// NOSICI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU -// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb] // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xeb] -// NOVI: :[[@LINE-4]]:36: error: src_shared_base register not available on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode +// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb] +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:36: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-6]]:36: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU s_add_i32 s0, src_shared_base, s0 +// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] -// NOSICI: 
:[[@LINE-2]]:15: error: src_shared_base register not available on this GPU // GFX9: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU s_add_i32 s0, src_shared_limit, s0 +// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] -// NOSICI: :[[@LINE-2]]:15: error: src_shared_limit register not available on this GPU // GFX9: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_shared_limit register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_limit register not available on this GPU s_add_i32 s0, src_private_base, s0 +// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] -// NOSICI: :[[@LINE-2]]:15: error: src_private_base register not available on this GPU // GFX9: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_private_base register not available on this GPU s_add_i32 s0, src_private_limit, s0 +// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] // GFX12XX: s_add_co_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] -// NOSICI: :[[@LINE-2]]:15: error: src_private_limit register not available on this GPU // GFX9: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] -// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81] +// NOSICI: :[[@LINE-4]]:15: error: src_private_limit register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_private_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_private_limit register not available on this GPU s_add_i32 s0, src_pops_exiting_wave_id, s0 -// NOSICI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU // GFX9: s_add_i32 s0, src_pops_exiting_wave_id, s0 ; encoding: [0xef,0x00,0x00,0x81] -// NOVI: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX11: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX12: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX11: :[[@LINE-2]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX12: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU +// NOSICI: :[[@LINE-5]]:15: 
error: src_pops_exiting_wave_id register not available on this GPU +// NOVI: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU s_and_b64 s[0:1], s[0:1], src_shared_base +// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_shared_base register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_base register not available on this GPU s_and_b64 s[0:1], s[0:1], src_shared_limit +// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_shared_limit register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_shared_limit register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_shared_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_limit register not available on this GPU s_and_b64 s[0:1], s[0:1], src_private_base +// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_private_base register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_private_base register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_private_base register not available on this GPU s_and_b64 s[0:1], s[0:1], src_private_limit +// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b] // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b] -// NOSICI: :[[@LINE-2]]:27: error: src_private_limit register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x86] -// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b] +// NOSICI: :[[@LINE-4]]:27: error: src_private_limit register not available on this GPU // NOVI: :[[@LINE-5]]:27: error: src_private_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_private_limit register not available on this GPU s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id -// NOSICI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU // GFX9: s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id ; encoding: [0x00,0xef,0x80,0x86] -// NOVI: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not 
available on this GPU -// NOGFX11: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX12: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX11: :[[@LINE-2]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX12: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOGFX1250: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOSICI: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU +// NOVI: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU // NOSICIVI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU v_add_u16 v0, src_shared_base, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4c] -// NOVI: :[[@LINE-3]]:15: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xeb,0x06,0x86,0x06] -// NOVI: :[[@LINE-3]]:20: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xd6,0x01,0x4c,0x00,0x06,0x06,0x86] -// NOVI: :[[@LINE-3]]:24: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: 
:[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:24: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32 v0, src_shared_base, v0 +// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a] // GFX12XX: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x68] -// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32_e64 v0, src_shared_base, v0 +// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00] // GFX12XX: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_add_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xeb,0x00,0x02,0x00] -// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_cmp_eq_i64 vcc, src_shared_base, v[0:1] -// NOSICI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU // GFX9: v_cmp_eq_i64_e32 vcc, src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0xc4,0x7d] -// NOVI: :[[@LINE-3]]:19: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode -// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode +// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode +// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode +// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode +// NOSICI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-6]]:19: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU v_max_f16 v0, src_shared_base, v0 +// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72] // GFX12XX: v_max_num_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x62] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x5a] -// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:15: error: 
src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_max_f32 v0, src_shared_base, v0 +// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20] // GFX12XX: v_max_num_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x2c] -// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU // GFX9: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x16] -// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20] +// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU v_max_f64 v[0:1], src_shared_base, v[0:1] +// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00] // GFX12XX: v_max_num_f64_e32 v[0:1], src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0x00,0x1c] -// NOSICI: :[[@LINE-2]]:19: error: src_shared_base register not available on this GPU // GFX9: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xeb,0x00,0x02,0x00] -// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00] +// NOSICI: :[[@LINE-4]]:19: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU v_pk_add_f16 v0, src_shared_base, v0 +// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18] // GFX12XX: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xeb,0x00,0x02,0x18] -// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, neg(src_shared_base) +// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20] // GFX12XX: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x85,0xd1,0xeb,0x00,0x00,0x20] -// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16 v0, abs(src_shared_base) +// GFX11: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00] // GFX12XX: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x85,0xd1,0xeb,0x00,0x00,0x00] -// GFX11: 
v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f64 v[5:6], |src_shared_base| -// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00] // GFX11: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00] // GFX12: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00] -// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU -// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU -// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction +// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00] +// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU +// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU v_ceil_f64 v[5:6], -src_shared_base -// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20] // GFX11: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20] // GFX12: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20] -// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU -// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU -// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction +// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20] +// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU +// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU v_ceil_f32 v0, -src_shared_base +// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20] // GFX12XX: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20] -// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU // GFX9: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x5d,0xd1,0xeb,0x00,0x00,0x20] -// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20] +// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU v_ceil_f32 v0, |src_shared_base| +// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: 
[0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00] // GFX12XX: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU // GFX9: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x5d,0xd1,0xeb,0x00,0x00,0x00] -// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00] +// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX9: v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0x96,0x00] -// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0x86,0x00] -// NOVI: :[[@LINE-3]]:21: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// 
NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD -// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported // GFX9: v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0xa6,0x00] -// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU -// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported -// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported -// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported +// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported +// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported +// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported +// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported +// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported //---------------------------------------------------------------------------// @@ -1584,206 +1584,206 @@ v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD //---------------------------------------------------------------------------// v_add_u32 v0, private_base, s0 -// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00] -// NOVI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00] +// NOGFX9: :[[@LINE-3]]:29: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_add_u32 v0, scc, s0 -// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] -// NOVI: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode -// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00] +// NOGFX9: :[[@LINE-3]]:20: error: invalid 
operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, shared_base, v0, v1 -// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] -// NOSICI: :[[@LINE-2]]:20: error: src_shared_base register not available on this GPU // GFX11: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] -// NOVI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU -// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04] +// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:20: error: src_shared_base register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, shared_limit, v1 -// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] -// NOSICI: :[[@LINE-2]]:24: error: src_shared_limit register not available on this GPU // GFX11: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] -// NOVI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU -// NOGFX9: :[[@LINE-5]]:24: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04] +// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU +// NOVI: :[[@LINE-5]]:24: error: src_shared_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:24: error: src_shared_limit register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, v1, private_limit -// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] -// NOSICI: :[[@LINE-2]]:28: error: src_private_limit register not available on this GPU // GFX11: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] -// NOVI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU -// NOGFX9: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03] +// NOGFX9: :[[@LINE-3]]:28: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU +// NOVI: :[[@LINE-5]]:28: error: src_private_limit register not available on this GPU // NOSICIVI: :[[@LINE-1]]:28: error: src_private_limit register not available on this GPU // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, execz, v0, v1 -// NOSICI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:20: error: invalid operand (violates 
constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:20: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:20: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:20: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:20: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:20: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:20: error: src_execz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:20: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions) // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, scc, v1 +// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] // GFX12XX: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] -// NOSICI: :[[@LINE-2]]:24: error: invalid operand (violates constant bus restrictions) // NOGFX89: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) -// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04] +// NOSICI: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:24: error: invalid operand (violates constant bus restrictions) // v_div_fmas implicitly reads VCC v_div_fmas_f32 v0, v0, v1, vccz -// NOSICI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:28: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:28: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:28: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:28: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:28: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:28: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions) // v_addc_co_u32 implicitly reads VCC (VOP2) v_addc_co_u32 v0, vcc, shared_base, v0, vcc -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX9: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: 
error: instruction not supported on this GPU v_madak_f32 v0, shared_base, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU -// NOVI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX9: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU +// NOVI: :[[@LINE-6]]:17: error: src_shared_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU v_madak_f32 v0, scc, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:17: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:17: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions) v_madak_f32 v0, 0xff32ff, v0, 0x11213141 -// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed v_madak_f32 v0, 0xff32ff, v0, 1 -// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: 
:[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed v_madmk_f32 v0, 0xff32ff, 0x11213141, v0 -// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed v_madmk_f32 v0, 0xff32ff, -1, v0 -// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed -// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed v_madak_f16 v0, 0xff32, v0, 0x1122 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madak_f16 v0, 0xff32, v0, 0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on 
this GPU +// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madmk_f16 v0, 0xff32, 0x1122, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_madmk_f16 v0, 0xff32, 1, v0 -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_cmp_eq_f32 s[0:1], private_base, private_limit -// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU -// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction +// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction +// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU v_cmp_eq_f32 s[0:1], private_base, s0 -// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU -// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU -// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction -// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction +// 
NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction +// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU +// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU v_cmp_eq_f32 s[0:1], execz, s0 -// NOSICI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions) -// NOGFX89: :[[@LINE-2]]:29: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-3]]:22: error: src_execz register not available on this GPU -// NOGFX12: :[[@LINE-4]]:22: error: src_execz register not available on this GPU -// NOGFX1250: :[[@LINE-5]]:22: error: src_execz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:22: error: src_execz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:22: error: src_execz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:22: error: src_execz register not available on this GPU +// NOGFX89: :[[@LINE-4]]:29: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions) // NOSICIVI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions) v_pk_add_f16 v255, private_base, private_limit -// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] -// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU // GFX11: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] -// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-5]]:34: error: invalid operand (violates constant bus restrictions) +// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18] +// NOGFX9: :[[@LINE-3]]:34: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU v_pk_add_f16 v255, vccz, execz -// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU -// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU -// NOGFX9: :[[@LINE-3]]:26: error: invalid operand (violates constant bus restrictions) -// NOGFX11: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU -// NOGFX12: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU -// NOGFX1250: :[[@LINE-6]]:20: error: src_vccz register not available on this GPU +// NOGFX11: :[[@LINE-1]]:20: error: src_vccz register not available on this GPU +// NOGFX12: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU +// NOGFX1250: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU +// NOGFX9: :[[@LINE-4]]:26: error: invalid operand (violates constant bus restrictions) +// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU 
//---------------------------------------------------------------------------// @@ -1791,36 +1791,36 @@ v_pk_add_f16 v255, vccz, execz //---------------------------------------------------------------------------// v_sqrt_f32 v2, lit(123) -// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, abs(lit(123)) -// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, lit(123.0) -// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] -// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42] -// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] // GFX11: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] +// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] +// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42] +// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42] v_sqrt_f64 v[2:3], lit(123.0) -// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] -// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xfe,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40,0x00,0x00,0x00,0x00] +// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40] +// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40] v_sqrt_f64 v[2:3], lit(123) -// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; 
encoding: [0xfe,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00] +// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] v_sqrt_f32 v2, lit 123.0 // NOGCN: :[[@LINE-1]]:20: error: expected left paren after lit @@ -1834,16 +1834,16 @@ v_sqrt_f32 v2, lit(v1) // Make sure lit() is accepted on operands without modifiers. v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) -// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00] // GFX89: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x30,0xe8,0x07,0x00,0x00] -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU +// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00] v_madak_f32 v4, lit(lit(0x7e8)), v8, lit(0x7e8) -// NOSICI: :[[@LINE-1]]:24: error: not a valid operand. -// NOGFX89: :[[@LINE-2]]:24: error: not a valid operand. -// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU -// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU +// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// NOGFX89: :[[@LINE-4]]:24: error: not a valid operand. +// NOSICI: :[[@LINE-5]]:24: error: not a valid operand. // NOSICIVI: :[[@LINE-1]]:24: error: not a valid operand. diff --git a/llvm/utils/update_mc_test_checks.py b/llvm/utils/update_mc_test_checks.py index ab7fe19c432de..67fff56333e03 100755 --- a/llvm/utils/update_mc_test_checks.py +++ b/llvm/utils/update_mc_test_checks.py @@ -290,11 +290,9 @@ def update_test(ti: common.TestInfo): # prefix is selected and generated with most shared output lines # each run_id can only be used once - gen_prefix = "" used_runid = set() - # line number diff between generated prefix and testline - line_offset = 1 + selected_prefixes = set() for prefix, tup in p_dict_sorted.items(): o, run_ids = tup @@ -308,18 +306,24 @@ def update_test(ti: common.TestInfo): else: used_runid.add(i) if not skip: - used_prefixes.add(prefix) + selected_prefixes.add(prefix) - if hasErr(o): - newline = getErrCheckLine(prefix, o, mc_mode, line_offset) - else: - newline = getStdCheckLine(prefix, o, mc_mode) + # Generate check lines in alphabetical order. 
+ check_lines = [] + for prefix in sorted(selected_prefixes): + o, run_ids = p_dict[prefix] + used_prefixes.add(prefix) + + if hasErr(o): + line_offset = len(check_lines) + 1 + check = getErrCheckLine(prefix, o, mc_mode, line_offset) + else: + check = getStdCheckLine(prefix, o, mc_mode) - if newline: - gen_prefix += newline - line_offset += 1 + if check: + check_lines.append(check.strip()) - generated_prefixes[input_line] = gen_prefix.rstrip("\n") + generated_prefixes[input_line] = "\n".join(check_lines) # write output for input_info in ti.iterlines(output_lines): From 20340accf235579d64faf322abc428bc5ddd7f91 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Wed, 22 Oct 2025 11:43:44 +0100 Subject: [PATCH 077/530] [NFC][WebAssembly] FP conversion interleave tests (#164576) --- .../CodeGen/WebAssembly/memory-interleave.ll | 1608 +++++++++++++++++ .../CodeGen/WebAssembly/simd-vector-trunc.ll | 39 + .../WebAssembly/memory-interleave.ll | 1000 ++++++++++ 3 files changed, 2647 insertions(+) diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll index 94efe0f4157f7..104ec31e13d3e 100644 --- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20 %struct.TwoInts = type { i32, i32 } %struct.ThreeInts = type { i32, i32, i32 } %struct.FourInts = type { i32, i32, i32, i32 } +%struct.TwoShorts = type { i16, i16 } %struct.ThreeShorts = type { i16, i16, i16 } %struct.FourShorts = type { i16, i16, i16, i16 } %struct.FiveShorts = type { i16, i16, i16, i16, i16 } @@ -12,6 +13,8 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20 %struct.ThreeBytes = type { i8, i8, i8 } %struct.FourBytes = type { i8, i8, i8, i8 } %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.TwoFloats = type { float, float } +%struct.FourFloats = type { float, float, float, float } ; CHECK-LABEL: two_ints_same_op: ; CHECK: loop @@ -1536,3 +1539,1608 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, 34: ; preds = %6, %4 ret void } + +; CHECK-LABEL: two_floats_same_op: +; CHECK-NOT: f32x4.mul +define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label 
%for.body +} + +; CHECK-LABEL: two_floats_vary_op: +; CHECK-NOT: f32x4 +define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp20.not = icmp eq i32 %N, 0 + br i1 %cmp20.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %inc = add nuw i32 %i.021, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_same_op: +; CHECK: loop +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 
= load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_vary_op: +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load64_zero +; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: 
i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store64_lane +define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store64_lane +define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret 
void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 
12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: v128.store +define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void 
@two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 
= load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: f32x4.mul +; CHECK: v128.store +define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %mul14 = fmul float %4, %5 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul14, ptr %z16, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %mul20 = fmul float %6, %7 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %mul20, ptr %w22, align 4 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_vary_op: +; CHECK-NOT: f32x4 +define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp42.not = icmp eq i32 %N, 0 + br i1 %cmp42.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr 
%arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z12, align 4 + %mul = fmul float %4, %5 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul, ptr %z14, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w17, align 4 + %div = fdiv float %6, %7 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %div, ptr %w19, align 4 + %inc = add nuw i32 %i.043, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 
3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.div +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = 
getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; 
CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 
3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.add +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.div +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i8x16.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 
%i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 
17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.mul +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.add +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 
28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.div +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: f32x4.convert_i32x4_s +; CHECK: f32x4.sub +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.store +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle 
{{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: 
f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op: +; CHECK: loop +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: v128.load +; CHECK: v128.load +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.mul +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.add +; 
CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.div +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: f32x4.sub +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.splat +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: f32x4.extract_lane +; CHECK: i32.trunc_sat_f32_s +; CHECK: i16x8.replace_lane +; CHECK: v128.store +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll index 123438d7f838b..f58456b4c8476 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll @@ -94,6 +94,19 @@ entry: ret <16 x i8> %0 } +define <8 x i8> @trunc8i16_8i8(<8 x i16> %a) { +; CHECK-LABEL: trunc8i16_8i8: +; CHECK: .functype trunc8i16_8i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <8 x i16> %a to <8 x i8> + ret <8 x i8> %0 +} + define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { ; CHECK-LABEL: trunc8i64_8i16: ; CHECK: .functype trunc8i64_8i16 (v128, v128, v128, v128) -> (v128) @@ -139,3 +152,29 @@ entry: %0 = trunc <8 x i32> %a to <8 x i16> ret <8 x i16> %0 } + +define <4 x i16> @trunc4i32_4i16(<4 x i32> %a) { +; CHECK-LABEL: trunc4i32_4i16: +; CHECK: .functype trunc4i32_4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <4 x i32> %a to <4 x i16> + ret <4 x i16> %0 +} + +define <4 x i8> @trunc4i32_4i8(<4 x i32> %a) { +; CHECK-LABEL: trunc4i32_4i8: +; CHECK: .functype trunc4i32_4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: # fallthrough-return +entry: + %0 = trunc <4 x i32> %a to <4 x i8> + ret <4 x i8> %0 +} diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index c8d20dccbb32b..e42e2c73d588a 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -7,6 +7,7 @@ target triple = "wasm32-unknown-wasi" %struct.TwoInts = type { i32, i32 } %struct.ThreeInts = type { i32, i32, i32 } %struct.FourInts = type { i32, i32, i32, i32 } +%struct.TwoShorts = type { i16, i16 } %struct.ThreeShorts = type { i16, i16, i16 } %struct.FourShorts = type { i16, i16, i16, i16 } %struct.TwoBytes = type { i8, i8 } @@ -14,6 +15,8 @@ target triple = "wasm32-unknown-wasi" %struct.FourBytes = type { i8, i8, i8, i8 } %struct.FiveBytes = type { i8, i8, i8, i8, i8 } 
%struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.TwoFloats = type { float, float } +%struct.FourFloats = type { float, float, float, float } ; CHECK-LABEL: two_ints_same_op ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10 @@ -1350,3 +1353,1000 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, 34: ; preds = %6, %4 ret void } + +; CHECK-LABEL: two_floats_same_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. +define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_vary_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. 
+define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp20.not = icmp eq i32 %N, 0 + br i1 %cmp20.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %inc = add nuw i32 %i.021, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 
costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. 
+define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. 
+define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_same_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. 
+define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_vary_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. 
+define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. 
+define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. 
+define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %mul14 = fmul float %4, %5 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul14, ptr %z16, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %mul20 = fmul float %6, %7 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %mul20, ptr 
%w22, align 4 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 1 +define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp42.not = icmp eq i32 %N, 0 + br i1 %cmp42.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z12, align 4 + %mul = fmul float %4, %5 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul, ptr %z14, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w17, align 4 + %div = fdiv float %6, %7 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %div, ptr %w19, align 4 + %inc = add nuw i32 %i.043, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, 
%entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: 
four_floats_four_bytes_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, 
%for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, 
i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + 
%conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} From 12bf1836dec8d5f47339b485727603568fa9e819 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 22 Oct 2025 12:44:55 +0200 Subject: [PATCH 078/530] [AutoUpgrade] Gracefully handle invalid alignment on masked intrinsics Generate a usage error instead of asserting. 
--- llvm/lib/IR/AutoUpgrade.cpp | 30 +++++++++--- .../autoupgrade-invalid-masked-align.ll | 49 +++++++++++++++++++ 2 files changed, 71 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Assembler/autoupgrade-invalid-masked-align.ll diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 7e5e7b524d85c..b838e36c8824f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5262,33 +5262,47 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } + auto GetMaybeAlign = [](Value *Op) { + if (auto *CI = dyn_cast(Op)) { + uint64_t Val = CI->getZExtValue(); + if (Val == 0) + return MaybeAlign(); + if (isPowerOf2_64(Val)) + return MaybeAlign(Val); + } + reportFatalUsageError("Invalid alignment argument"); + }; + auto GetAlign = [&](Value *Op) { + MaybeAlign Align = GetMaybeAlign(Op); + if (Align) + return *Align; + reportFatalUsageError("Invalid zero alignment argument"); + }; + const DataLayout &DL = CI->getDataLayout(); switch (NewFn->getIntrinsicID()) { case Intrinsic::masked_load: NewCall = Builder.CreateMaskedLoad( - CI->getType(), CI->getArgOperand(0), - cast(CI->getArgOperand(1))->getAlignValue(), + CI->getType(), CI->getArgOperand(0), GetAlign(CI->getArgOperand(1)), CI->getArgOperand(2), CI->getArgOperand(3)); break; case Intrinsic::masked_gather: NewCall = Builder.CreateMaskedGather( CI->getType(), CI->getArgOperand(0), - DL.getValueOrABITypeAlignment( - cast(CI->getArgOperand(1))->getMaybeAlignValue(), - CI->getType()->getScalarType()), + DL.getValueOrABITypeAlignment(GetMaybeAlign(CI->getArgOperand(1)), + CI->getType()->getScalarType()), CI->getArgOperand(2), CI->getArgOperand(3)); break; case Intrinsic::masked_store: NewCall = Builder.CreateMaskedStore( CI->getArgOperand(0), CI->getArgOperand(1), - cast(CI->getArgOperand(2))->getAlignValue(), - CI->getArgOperand(3)); + GetAlign(CI->getArgOperand(2)), CI->getArgOperand(3)); break; case Intrinsic::masked_scatter: NewCall = Builder.CreateMaskedScatter( CI->getArgOperand(0), CI->getArgOperand(1), DL.getValueOrABITypeAlignment( - cast(CI->getArgOperand(2))->getMaybeAlignValue(), + GetMaybeAlign(CI->getArgOperand(2)), CI->getArgOperand(0)->getType()->getScalarType()), CI->getArgOperand(3)); break; diff --git a/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll new file mode 100644 index 0000000000000..458bd2edce712 --- /dev/null +++ b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll @@ -0,0 +1,49 @@ +; RUN: split-file %s %t +; RUN: not llvm-as < %t/masked-store.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE +; RUN: not llvm-as < %t/masked-store-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE-ZERO +; RUN: not llvm-as < %t/masked-load.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD +; RUN: not llvm-as < %t/masked-load-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD-ZERO +; RUN: not llvm-as < %t/masked-scatter.ll 2>&1 | FileCheck %s --check-prefix=MASKED-SCATTER +; RUN: not llvm-as < %t/masked-gather.ll 2>&1 | FileCheck %s --check-prefix=MASKED-GATHER + +;--- masked-store.ll +; MASKED-STORE: LLVM ERROR: Invalid alignment argument +define void @masked_store(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 3, <2 x i1> %mask) + ret void +} + +;--- masked-store-zero.ll +; MASKED-STORE-ZERO: LLVM ERROR: Invalid zero alignment argument +define void @masked_store_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call 
void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 0, <2 x i1> %mask) + ret void +} + +;--- masked-load.ll +; MASKED-LOAD: LLVM ERROR: Invalid alignment argument +define void @masked_load(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 3, <2 x i1> %mask, <2 x double> %val) + ret void +} + +;--- masked-load-zero.ll +; MASKED-LOAD-ZERO: LLVM ERROR: Invalid zero alignment argument +define void @masked_load_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) { + call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 0, <2 x i1> %mask, <2 x double> %val) + ret void +} + +;--- masked-scatter.ll +; MASKED-SCATTER: LLVM ERROR: Invalid alignment argument +define void @masked_scatter(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) { + call void @llvm.masked.scatter.v2f64.p0(<2 x double> %val, <2 x ptr> %ptr, i32 3, <2 x i1> %mask) + ret void +} + +;--- masked-gather.ll +; MASKED-GATHER: LLVM ERROR: Invalid alignment argument +define void @masked_gather(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) { + call <2 x double> @llvm.masked.gather.v2f64.p0(<2 x ptr> %ptr, i32 3, <2 x i1> %mask, <2 x double> %val) + ret void +} From cde445716907ccf1003f2d7a95c1a672178d6e8e Mon Sep 17 00:00:00 2001 From: Sushant Gokhale Date: Wed, 22 Oct 2025 16:32:28 +0530 Subject: [PATCH 079/530] [ShrinkWrap][NFC] Test with load from constant pool preventing shrink (#162476) wrapping Shrink wrapping treats a load from constant pool as a stack access. This is not correct. Constants are basically stored in read only section AFAIU. This prevents shrink wrapping from kicking in. (Related to PR #160257. PR #160257 will be closed.) --- .../AArch64/shrink-wrap-const-pool-access.mir | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir new file mode 100644 index 0000000000000..80312740607b2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir @@ -0,0 +1,74 @@ +# RUN: llc -mtriple=aarch64 -simplify-mir -run-pass=shrink-wrap -o - %s | FileCheck %s +--- | + declare double @foo() + + define double @shrink_wrap_load_from_const_pool(double %q) { + entry: + %0 = fcmp oeq double %q, 3.125500e+02 + br i1 %0, label %common.ret, label %if.else + + common.ret: ; preds = %if.else, %entry, %exit1 + %common.ret.op = phi double [ %3, %exit1 ], [ 0.000000e+00, %entry ], [ 0.000000e+00, %if.else ] + ret double %common.ret.op + + if.else: ; preds = %entry + %1 = call double @foo() + %2 = fcmp oeq double %1, 0.000000e+00 + br i1 %2, label %exit1, label %common.ret + + exit1: ; preds = %if.else + %3 = call double @foo() + br label %common.ret + } +... +# FIXME: Following code has a load from constant pool. Accessing constant pool +# must not be considered as a stack access and hence, shrink wrapping must +# happen. 
+# CHECK-LABEL:name: shrink_wrap_load_from_const_pool +# CHECK-NOT: savePoint +# CHECK-NOT: restorePoint +--- +name: shrink_wrap_load_from_const_pool +tracksRegLiveness: true +constants: + - id: 0 + value: 'double 3.125500e+02' + alignment: 8 +body: | + bb.0.entry: + successors: %bb.4(0x50000000), %bb.2(0x30000000) + liveins: $d0 + + renamable $d1 = COPY $d0 + renamable $x8 = ADRP target-flags(aarch64-page) %const.0 + renamable $d2 = LDRDui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) %const.0 :: (load (s64) from constant-pool) + renamable $d0 = FMOVD0 + nofpexcept FCMPDrr killed renamable $d1, killed renamable $d2, implicit-def $nzcv, implicit $fpcr + Bcc 1, %bb.2, implicit killed $nzcv + + bb.4: + liveins: $d0 + + bb.1.common.ret: + liveins: $d0 + + RET_ReallyLR implicit $d0 + + bb.2.if.else: + successors: %bb.3(0x50000000), %bb.1(0x30000000) + + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + renamable $d1 = COPY $d0 + renamable $d0 = FMOVD0 + nofpexcept FCMPDri killed renamable $d1, implicit-def $nzcv, implicit $fpcr + Bcc 1, %bb.1, implicit killed $nzcv + B %bb.3 + + bb.3.exit1: + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + B %bb.1 +... From 10d3c6bc11ca919563036aa590440c38502bcd2f Mon Sep 17 00:00:00 2001 From: Sushant Gokhale Date: Wed, 22 Oct 2025 16:35:57 +0530 Subject: [PATCH 080/530] [ShrinkWrap] Consider constant pool access as non-stack access (#164393) As far as I understand, constant pool access does not access stack and accesses read-only memory. This patch considers constant pool access as non-stack access allowing shrink wrapping to happen in the concerned test. We should be seeing perf improvement with povray benchmark from SPEC17(around 12% with -flto -Ofast) after this patch. An NFC PR #162476 already exists to upload the test before the patch but approval has got delayed. So, as @davemgreen suggested in that PR, I have uploaded the test and patch in this single PR to show how test looks like. --- llvm/lib/CodeGen/ShrinkWrap.cpp | 2 +- .../CodeGen/AArch64/shrink-wrap-const-pool-access.mir | 10 ++++++---- llvm/test/CodeGen/X86/fp128-select.ll | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index 826e4126de44c..83581052560cb 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -319,7 +319,7 @@ bool ShrinkWrapImpl::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS, return isa(UO); } if (const PseudoSourceValue *PSV = Op->getPseudoValue()) - return PSV->isJumpTable(); + return PSV->isJumpTable() || PSV->isConstantPool(); return false; }; // Load/store operations may access the stack indirectly when we previously diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir index 80312740607b2..6f33a75ab9fcb 100644 --- a/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir +++ b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir @@ -21,12 +21,14 @@ br label %common.ret } ... -# FIXME: Following code has a load from constant pool. 
Accessing constant pool +# Following code has a load from constant pool. Accessing constant pool # must not be considered as a stack access and hence, shrink wrapping must # happen. -# CHECK-LABEL:name: shrink_wrap_load_from_const_pool -# CHECK-NOT: savePoint -# CHECK-NOT: restorePoint +# CHECK-LABEL:name: shrink_wrap_load_from_const_pool +# CHECK: savePoint: +# CHECK: - point: '%bb.3' +# CHECK: restorePoint: +# CHECK: - point: '%bb.5' --- name: shrink_wrap_load_from_const_pool tracksRegLiveness: true diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll index 659e4ddedc646..27a651e23f886 100644 --- a/llvm/test/CodeGen/X86/fp128-select.ll +++ b/llvm/test/CodeGen/X86/fp128-select.ll @@ -13,8 +13,8 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: testl %edx, %edx ; SSE-NEXT: jne .LBB0_1 -; SSE-NEXT: # %bb.3: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: # %bb.2: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN] ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: retq ; SSE-NEXT: .LBB0_1: @@ -58,7 +58,7 @@ define fp128 @test_select_cc(fp128, fp128) nounwind { ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: jmp .LBB1_3 ; SSE-NEXT: .LBB1_1: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0] ; SSE-NEXT: .LBB1_3: # %BB0 ; SSE-NEXT: testl %ebx, %ebx ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload From 15d11ebc84886e06127750ef5bea60ba1d36798a Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 19:07:23 +0800 Subject: [PATCH 081/530] [NFC] "unsafe-fp-math" post cleanup (code comments part) (#164582) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 4 ++-- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/lib/Target/X86/X86InstrAVX512.td | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6aa71254fe6ef..d2ea6525e1116 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17460,8 +17460,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fpext (fneg (fmul, x, y))), z) // -> (fneg (fma (fpext x), (fpext y), z)) // Note: This could be removed with appropriate canonicalization of the - // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the - // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent + // input expression into (fneg (fadd (fpext (fmul, x, y)), z)). However, the + // command line flag -fp-contract=fast and fast-math flag contract prevent // from implementing the canonicalization in visitFSUB. if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); @@ -17485,7 +17485,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fneg (fma (fpext x)), (fpext y), z) // Note: This could be removed with appropriate canonicalization of the // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the - // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent + // command line flag -fp-contract=fast and fast-math flag contract prevent // from implementing the canonicalization in visitFSUB. 
if (matcher.match(N0, ISD::FNEG)) { SDValue N00 = N0.getOperand(0); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index d47752267ba83..17f04d0fd05e8 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14736,8 +14736,8 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, } unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { - // Note: This functionality is used only when unsafe-fp-math is enabled, and - // on cores with reciprocal estimates (which are used when unsafe-fp-math is + // Note: This functionality is used only when arcp is enabled, and + // on cores with reciprocal estimates (which are used when arcp is // enabled for division), this functionality is redundant with the default // combiner logic (once the division -> reciprocal/multiply transformation // has taken place). As a result, this matters more for older cores than for diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b54a1e71d8c10..d49f25a950e3a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20558,7 +20558,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // NOTE: By using fsub of a positive constant instead of fadd of a negative - // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is + // constant, we avoid reassociation in MachineCombiner when reassoc is // enabled. See PR24512. SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 83bd6ac26cc59..1b748b7355716 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5519,7 +5519,7 @@ defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs, defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs, SchedWriteFCmpSizes, 0>; -// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use +// MIN/MAX nodes are commutable under (nnan + ninf). 
In this case we use // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, From ec546ce5745b195655cbdf645322d5dda91374e0 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 19:14:37 +0800 Subject: [PATCH 082/530] [lld][test] Remove unsafe-fp-math uses (NFC) (#164598) --- lld/test/COFF/Inputs/undefined-symbol-lto-a.ll | 4 ++-- lld/test/COFF/Inputs/undefined-symbol-lto-b.ll | 2 +- lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll | 4 ++-- lld/test/MachO/lto-mattrs.ll | 2 +- lld/test/wasm/Inputs/debuginfo1.ll | 6 +++--- lld/test/wasm/Inputs/debuginfo2.ll | 2 +- lld/test/wasm/debug-removed-fn.ll | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll b/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll index 7e29044ebe00d..f57a3e3c1ede4 100644 --- a/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll +++ b/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll @@ -47,8 +47,8 @@ entry: ret void } -attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } +attributes #1 = { nounwind sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.linker.options = !{!1, !2} diff --git a/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll b/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll index 0f64e236a1f47..7347fde7a9dee 100644 --- a/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll +++ b/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll @@ -11,7 +11,7 @@ entry: ret void } -attributes #0 = { norecurse nounwind readnone sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readnone sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } !llvm.linker.options = !{!0, !1} !llvm.module.flags = !{!2, !3, !4, !5} diff --git a/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll b/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll index 5f6730272e610..61828cae7ea58 100644 --- a/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll +++ b/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll @@ -13,8 +13,8 @@ declare dso_local i32 @"?foo@@YAHXZ"() #1 declare dso_local i32 @"?bar@@YAHXZ"() #1 -attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/lld/test/MachO/lto-mattrs.ll b/lld/test/MachO/lto-mattrs.ll index f658b485a1792..41313296dc269 100644 --- a/lld/test/MachO/lto-mattrs.ll +++ b/lld/test/MachO/lto-mattrs.ll @@ -33,4 +33,4 @@ define float @foo(float %x) #0 { ret float %div } -attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } +attributes #0 = { "reciprocal-estimates"="divf,vec-divf" } diff --git a/lld/test/wasm/Inputs/debuginfo1.ll b/lld/test/wasm/Inputs/debuginfo1.ll index d6db88002ce1e..0a1c42a214f2c 100644 --- a/lld/test/wasm/Inputs/debuginfo1.ll +++ 
b/lld/test/wasm/Inputs/debuginfo1.ll @@ -35,9 +35,9 @@ declare void @foo(i32) local_unnamed_addr #2 ; Function Attrs: nounwind readnone speculatable declare void @llvm.dbg.value(metadata, metadata, metadata) #3 -attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" } attributes #3 = { nounwind readnone speculatable } attributes #4 = { nounwind } diff --git a/lld/test/wasm/Inputs/debuginfo2.ll b/lld/test/wasm/Inputs/debuginfo2.ll index 1b63dd57474e5..c832be521e325 100644 --- a/lld/test/wasm/Inputs/debuginfo2.ll +++ b/lld/test/wasm/Inputs/debuginfo2.ll @@ -31,7 +31,7 @@ entry: ; Function Attrs: nounwind readnone speculatable declare void @llvm.dbg.value(metadata, metadata, metadata) #1 -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" } attributes #1 = { nounwind readnone speculatable } !llvm.dbg.cu = !{!2} diff --git a/lld/test/wasm/debug-removed-fn.ll b/lld/test/wasm/debug-removed-fn.ll index 8dae48a136dc4..20c30346cc0e7 100644 --- a/lld/test/wasm/debug-removed-fn.ll +++ b/lld/test/wasm/debug-removed-fn.ll @@ -28,7 +28,7 @@ entry: ret i32 6, !dbg !13 } -attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5} From 8b2aba2e20c3cfb9d2e9337fdc38c889b0ff8ae2 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Wed, 22 Oct 2025 12:16:11 +0100 Subject: [PATCH 083/530] [WPD]: Enable speculative devirtualizatoin. (#159048) This patch implements the speculative devirtualization feature in the LLVM backend. It handles the case of single implementation devirtualization where there is a single possible callee of a virtual function. - Add cl::opt 'devirtualize-speculatively' to enable it. - Flag is disabled by default. - It works regardless of the visibility of the object. - Not enabled for LTO for now. --- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 78 ++++++++--- .../speculative-devirt-single-impl.ll | 132 ++++++++++++++++++ .../virtual-const-prop-check.ll | 7 + 3 files changed, 199 insertions(+), 18 deletions(-) create mode 100644 llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 76e588b116c04..a0f7ec6d5fae3 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -24,7 +24,8 @@ // returns 0, or a single vtable's function returns 1, replace each virtual // call with a comparison of the vptr against that vtable's address. // -// This pass is intended to be used during the regular and thin LTO pipelines: +// This pass is intended to be used during the regular/thin and non-LTO +// pipelines: // // During regular LTO, the pass determines the best optimization for each // virtual call and applies the resolutions directly to virtual calls that are @@ -48,6 +49,14 @@ // is supported. // - Import phase: (same as with hybrid case above). // +// During Speculative devirtualization mode -not restricted to LTO-: +// - The pass applies speculative devirtualization without requiring any type of +// visibility. +// - Skips other features like virtual constant propagation, uniform return +// value optimization, unique return value optimization and branch funnels as +// they need LTO. +// - This mode is enabled via 'devirtualize-speculatively' flag. 
+// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -61,7 +70,9 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" @@ -145,6 +156,13 @@ static cl::opt ClWriteSummary( "bitcode, otherwise YAML"), cl::Hidden); +// TODO: This option eventually should support any public visibility vtables +// with/out LTO. +static cl::opt ClDevirtualizeSpeculatively( + "devirtualize-speculatively", + cl::desc("Enable speculative devirtualization optimization"), + cl::init(false)); + static cl::opt ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden, cl::init(10), @@ -892,6 +910,8 @@ void llvm::updatePublicTypeTestCalls(Module &M, CI->eraseFromParent(); } } else { + // TODO: Don't replace public type tests when speculative devirtualization + // gets enabled in LTO mode. auto *True = ConstantInt::getTrue(M.getContext()); for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) { auto *CI = cast(U.getUser()); @@ -1083,10 +1103,10 @@ bool DevirtModule::tryFindVirtualCallTargets( if (!TM.Bits->GV->isConstant()) return false; - // We cannot perform whole program devirtualization analysis on a vtable - // with public LTO visibility. - if (TM.Bits->GV->getVCallVisibility() == - GlobalObject::VCallVisibilityPublic) + // Without ClDevirtualizeSpeculatively, we cannot perform whole program + // devirtualization analysis on a vtable with public LTO visibility. + if (!ClDevirtualizeSpeculatively && TM.Bits->GV->getVCallVisibility() == + GlobalObject::VCallVisibilityPublic) return false; Function *Fn = nullptr; @@ -1105,6 +1125,12 @@ bool DevirtModule::tryFindVirtualCallTargets( if (Fn->getName() == "__cxa_pure_virtual") continue; + // In most cases empty functions will be overridden by the + // implementation of the derived class, so we can skip them. + if (ClDevirtualizeSpeculatively && Fn->getReturnType()->isVoidTy() && + Fn->getInstructionCount() <= 1) + continue; + // We can disregard unreachable functions as possible call targets, as // unreachable functions shouldn't be called. if (mustBeUnreachableFunction(Fn, ExportSummary)) @@ -1223,10 +1249,12 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, CallTrap->setDebugLoc(CB.getDebugLoc()); } - // If fallback checking is enabled, add support to compare the virtual - // function pointer to the devirtualized target. In case of a mismatch, - // fall back to indirect call. - if (DevirtCheckMode == WPDCheckMode::Fallback) { + // If fallback checking or speculative devirtualization are enabled, + // add support to compare the virtual function pointer to the + // devirtualized target. In case of a mismatch, fall back to indirect + // call. + if (DevirtCheckMode == WPDCheckMode::Fallback || + ClDevirtualizeSpeculatively) { MDNode *Weights = MDBuilder(M.getContext()).createLikelyBranchWeights(); // Version the indirect call site. 
If the called value is equal to the // given callee, 'NewInst' will be executed, otherwise the original call @@ -2057,15 +2085,15 @@ void DevirtModule::scanTypeTestUsers( Function *TypeTestFunc, DenseMap> &TypeIdMap) { // Find all virtual calls via a virtual table pointer %p under an assumption - // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p - // points to a member of the type identifier %md. Group calls by (type ID, - // offset) pair (effectively the identity of the virtual function) and store - // to CallSlots. + // of the form llvm.assume(llvm.type.test(%p, %md)) or + // llvm.assume(llvm.public.type.test(%p, %md)). + // This indicates that %p points to a member of the type identifier %md. + // Group calls by (type ID, offset) pair (effectively the identity of the + // virtual function) and store to CallSlots. for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) { auto *CI = dyn_cast(U.getUser()); if (!CI) continue; - // Search for virtual calls based on %p and add them to DevirtCalls. SmallVector DevirtCalls; SmallVector Assumes; @@ -2348,6 +2376,12 @@ bool DevirtModule::run() { (ImportSummary && ImportSummary->partiallySplitLTOUnits())) return false; + Function *PublicTypeTestFunc = nullptr; + // If we are in speculative devirtualization mode, we can work on the public + // type test intrinsics. + if (ClDevirtualizeSpeculatively) + PublicTypeTestFunc = + Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); Function *TypeTestFunc = Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); Function *TypeCheckedLoadFunc = @@ -2361,8 +2395,9 @@ bool DevirtModule::run() { // module, this pass has nothing to do. But if we are exporting, we also need // to handle any users that appear only in the function summaries. if (!ExportSummary && - (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc || - AssumeFunc->use_empty()) && + (((!PublicTypeTestFunc || PublicTypeTestFunc->use_empty()) && + (!TypeTestFunc || TypeTestFunc->use_empty())) || + !AssumeFunc || AssumeFunc->use_empty()) && (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()) && (!TypeCheckedLoadRelativeFunc || TypeCheckedLoadRelativeFunc->use_empty())) @@ -2373,6 +2408,9 @@ bool DevirtModule::run() { DenseMap> TypeIdMap; buildTypeIdentifierMap(Bits, TypeIdMap); + if (PublicTypeTestFunc && AssumeFunc) + scanTypeTestUsers(PublicTypeTestFunc, TypeIdMap); + if (TypeTestFunc && AssumeFunc) scanTypeTestUsers(TypeTestFunc, TypeIdMap); @@ -2472,8 +2510,12 @@ bool DevirtModule::run() { .WPDRes[S.first.ByteOffset]; if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos, S.first.ByteOffset, ExportSummary)) { - - if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { + bool SingleImplDevirt = + trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res); + // Out of speculative devirtualization mode, Try to apply virtual constant + // propagation or branch funneling. + // TODO: This should eventually be enabled for non-public type tests. 
+ if (!SingleImplDevirt && !ClDevirtualizeSpeculatively) { DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first); diff --git a/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll new file mode 100644 index 0000000000000..10566ae0dee8e --- /dev/null +++ b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll @@ -0,0 +1,132 @@ +; -stats requires asserts +; REQUIRES: asserts + +; Check that we can still devirtualize outside LTO mode when speculative devirtualization is enabled. +; Check that we skip devirtualization for empty functions in speculative devirtualization mode + +; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively \ +; RUN: -pass-remarks=wholeprogramdevirt -stats %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: remark: devirt-single.cc:30:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:41:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:51:32: single-impl: devirtualized a call to vf +; CHECK: remark: devirt-single.cc:13:0: devirtualized vf +; CHECK-NOT: devirtualized + +@vt1 = constant [1 x ptr] [ptr @vf], !type !8 +@vt2 = constant [1 x ptr] [ptr @vf_empty], !type !12 + +define i1 @vf(ptr %this) #0 !dbg !7 { + ret i1 true +} + +; This should NOT be devirtualized because during non-lto empty functions +; are skipped. +define void @vf_empty(ptr %this) !dbg !11 { + ret void +} + +; CHECK: define void @call +define void @call(ptr %obj) #1 !dbg !5 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.public.type.test(ptr %vtable, metadata !"typeid") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr %vtable + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !6 + ret void +} + + +; CHECK: define void @call1 +define void @call1(ptr %obj) #1 !dbg !9 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptr = load ptr, ptr %vtable, align 8 + ; CHECK: call i1 %fptr + %1 = call i1 %fptr(ptr %obj), !dbg !10 + ret void +} +declare ptr @llvm.load.relative.i32(ptr, i32) + +@vt3 = private unnamed_addr constant [1 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr @vt3 to i64)) to i32) +], align 4, !type !15 + +; CHECK: define void @call2 +define void @call2(ptr %obj) #1 !dbg !13 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid2") + call void @llvm.assume(i1 %p) + %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 0) + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !14 + ret void +} + +@_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [ + i32 0, ; offset to top + i32 0, ; rtti + i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32) ; vf_emptyunc offset +] }, align 4, !type !18 + +; CHECK: define void @call3 +define void @call3(ptr %obj) #1 !dbg !16 { + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid3") + call void @llvm.assume(i1 %p) + %fptr 
= call ptr @llvm.load.relative.i32(ptr %vtable, i32 8) + ; CHECK: if.true.direct_targ: + ; CHECK: call i1 @vf( + ; CHECK: if.false.orig_indirect: + ; CHECK: call i1 %fptr( + call i1 %fptr(ptr %obj), !dbg !17 + ret void +} + + +declare i1 @llvm.type.test(ptr, metadata) +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 (trunk 278098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "devirt-single.cc", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{!"clang version 4.0.0 (trunk 278098)"} +!5 = distinct !DISubprogram(name: "call", linkageName: "_Z4callPv", scope: !1, file: !1, line: 29, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!6 = !DILocation(line: 30, column: 32, scope: !5) +!7 = distinct !DISubprogram(name: "vf", linkageName: "_ZN3vt12vfEv", scope: !1, file: !1, line: 13, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!8 = !{i32 0, !"typeid"} + +!9 = distinct !DISubprogram(name: "call1", linkageName: "_Z5call1Pv", scope: !1, file: !1, line: 31, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!10 = !DILocation(line: 35, column: 32, scope: !9) +!11 = distinct !DISubprogram(name: "vf_empty", linkageName: "_ZN3vt18vf_emptyEv", scope: !1, file: !1, line: 23, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!12 = !{i32 0, !"typeid1"} + +!13 = distinct !DISubprogram(name: "call2", linkageName: "_Z5call2Pv", scope: !1, file: !1, line: 40, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!14 = !DILocation(line: 41, column: 32, scope: !13) +!15 = !{i32 0, !"typeid2"} + +!16 = distinct !DISubprogram(name: "call3", linkageName: "_Z5call3Pv", scope: !1, file: !1, line: 50, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0) +!17 = !DILocation(line: 51, column: 32, scope: !16) +!18 = !{i32 0, !"typeid3"} + + + +; CHECK: 1 wholeprogramdevirt - Number of whole program devirtualization targets +; CHECK: 3 wholeprogramdevirt - Number of single implementation devirtualizations diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll index d8f5c912e9a50..8327e1cfdf1d2 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll @@ -11,6 +11,9 @@ ; Check wildcard ; RUN: opt -S -passes=wholeprogramdevirt -whole-program-visibility -pass-remarks=wholeprogramdevirt -wholeprogramdevirt-skip=vf?i1 %s 2>&1 | FileCheck %s --check-prefix=SKIP +; Check that no stats are reported in speculative devirtualization mode as the virtual const prop is disabled. 
+; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively -stats %s 2>&1 | FileCheck %s --check-prefix=CHECK-SPECULATIVE-WPD + target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" @@ -225,3 +228,7 @@ declare ptr @llvm.load.relative.i32(ptr, i32) ; CHECK: 2 wholeprogramdevirt - Number of unique return value optimizations ; CHECK: 2 wholeprogramdevirt - Number of virtual constant propagations ; CHECK: 2 wholeprogramdevirt - Number of 1 bit virtual constant propagations + +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of unique return value optimizations +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of virtual constant propagations +; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of 1 bit virtual constant propagations From a4dbd111c285012d744fa0f86e710e4b3032d826 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 22 Oct 2025 12:22:49 +0100 Subject: [PATCH 084/530] [LLVM][CodeGen][AArch64] Fix global-isel for LD1R. (#164418) LD1Rv8b only supports a base register but the DAG is matched using am_indexed8 with the offset it finds silently dropped. I've also fixed a couple of immediate operands types inconsistencies that don't manifest as bugs because their incorrect scaling is overriden by the complex pattern and MachineInstr that are correct and thus there's nothing to test. --- .../lib/Target/AArch64/AArch64InstrAtomics.td | 12 ++++----- llvm/lib/Target/AArch64/AArch64InstrGISel.td | 2 +- llvm/test/CodeGen/AArch64/arm64-ld1.ll | 26 ++++++------------- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index 31fcd63b9f2c8..5d9215dd71233 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -136,8 +136,8 @@ def : Pat<(f32 (bitconvert (i32 (relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend))))), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; def : Pat<(f32 (bitconvert (i32 (relaxed_load - (am_indexed32 GPR64sp:$Rn, uimm12s8:$offset))))), - (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>; + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; def : Pat<(f32 (bitconvert (i32 (relaxed_load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), (LDURSi GPR64sp:$Rn, simm9:$offset)>; @@ -236,11 +236,11 @@ def : Pat<(relaxed_store def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), (STLRX GPR64:$val, GPR64sp:$ptr)>; def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend), + ro_Wextend64:$extend), GPR64:$val), (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend), + ro_Xextend64:$extend), GPR64:$val), (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; def : Pat<(relaxed_store @@ -276,8 +276,8 @@ def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, (i64 (bitconvert (f64 FPR64Op:$val)))), (STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; def : Pat<(relaxed_store - (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), - (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>; + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(relaxed_store (am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 
(bitconvert (f64 FPR64Op:$val)))), (STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index fe8419301b306..30b7b03f7a69a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -507,7 +507,7 @@ let AddedComplexity = 19 in { defm : VecROStoreLane64_0Pat; } -def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))), +def : Pat<(v8i8 (AArch64dup (i8 (load GPR64sp:$Rn)))), (LD1Rv8b GPR64sp:$Rn)>; def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))), (LD1Rv16b GPR64sp:$Rn)>; diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll index 0b22fa49cb5c1..c2b2c1ebf58fe 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll @@ -1654,24 +1654,14 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) { } define <8 x i8> @dup_ld1_from_stack(ptr %__ret) { -; CHECK-SD-LABEL: dup_ld1_from_stack: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sub sp, sp, #16 -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: add x8, sp, #15 -; CHECK-SD-NEXT: ld1r.8b { v0 }, [x8] -; CHECK-SD-NEXT: add sp, sp, #16 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: dup_ld1_from_stack: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: .cfi_offset w29, -16 -; CHECK-GI-NEXT: add x8, sp, #15 -; CHECK-GI-NEXT: ld1r.8b { v0 }, [x8] -; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-LABEL: dup_ld1_from_stack: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #15 +; CHECK-NEXT: ld1r.8b { v0 }, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret entry: %item = alloca i8, align 1 %0 = load i8, ptr %item, align 1 From 128eacfaba78162c944c073270db02e237b7b851 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 22 Oct 2025 12:23:32 +0100 Subject: [PATCH 085/530] [LLVM][CodeGen][SVE] Fix typo in PPR_p8to15's DecoderMethod. (#164429) --- llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index ef974df823100..47144c7333f7a 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -993,7 +993,7 @@ def PPR_3b : PPRClass<0, 7> { // Restricted 3 bit SVE predicate register class. 
let DecoderMethod = "DecodeSimpleRegisterClass"; } def PPR_p8to15 : PPRClass<8, 15> { - let DecoderMethod = "DecodeSimpleRegisterClass"; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def PPRMul2 : PPRClass<0, 14, 2>; From b8062f85dd3612f2b5c0c5cfc14bdc5c0eae641f Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Wed, 22 Oct 2025 12:44:42 +0100 Subject: [PATCH 086/530] [lldb-dap] Use protocol types for exceptioninfo (#164318) It also separates the `ProtocolRequestsTests` from `ProtocolTypesTests` as I did not want to increase the work in https://github.com/llvm/llvm-project/pull/144595 --- .../Handler/ExceptionInfoRequestHandler.cpp | 211 +++++------------- lldb/tools/lldb-dap/Handler/RequestHandler.h | 10 +- .../lldb-dap/Protocol/ProtocolRequests.cpp | 18 ++ .../lldb-dap/Protocol/ProtocolRequests.h | 22 ++ .../tools/lldb-dap/Protocol/ProtocolTypes.cpp | 33 +++ lldb/tools/lldb-dap/Protocol/ProtocolTypes.h | 30 +++ lldb/unittests/DAP/CMakeLists.txt | 1 + lldb/unittests/DAP/ProtocolRequestsTest.cpp | 69 ++++++ lldb/unittests/DAP/ProtocolTypesTest.cpp | 47 ++++ .../TestingSupport/TestUtilities.cpp | 5 + lldb/unittests/TestingSupport/TestUtilities.h | 4 + 11 files changed, 296 insertions(+), 154 deletions(-) create mode 100644 lldb/unittests/DAP/ProtocolRequestsTest.cpp diff --git a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp index c1c2adb32a510..fa01a2036e1dd 100644 --- a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp @@ -7,168 +7,77 @@ //===----------------------------------------------------------------------===// #include "DAP.h" -#include "EventHelper.h" -#include "JSONUtils.h" +#include "DAPError.h" +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" #include "RequestHandler.h" #include "lldb/API/SBStream.h" +using namespace lldb_dap::protocol; + namespace lldb_dap { -// "ExceptionInfoRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Retrieves the details of the exception that -// caused this event to be raised. Clients should only call this request if -// the corresponding capability `supportsExceptionInfoRequest` is true.", -// "properties": { -// "command": { -// "type": "string", -// "enum": [ "exceptionInfo" ] -// }, -// "arguments": { -// "$ref": "#/definitions/ExceptionInfoArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "ExceptionInfoArguments": { -// "type": "object", -// "description": "Arguments for `exceptionInfo` request.", -// "properties": { -// "threadId": { -// "type": "integer", -// "description": "Thread for which exception information should be -// retrieved." -// } -// }, -// "required": [ "threadId" ] -// }, -// "ExceptionInfoResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to `exceptionInfo` request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "exceptionId": { -// "type": "string", -// "description": "ID of the exception that was thrown." -// }, -// "description": { -// "type": "string", -// "description": "Descriptive text for the exception." -// }, -// "breakMode": { -// "$ref": "#/definitions/ExceptionBreakMode", -// "description": "Mode that caused the exception notification to -// be raised." 
-// }, -// "details": { -// "$ref": "#/definitions/ExceptionDetails", -// "description": "Detailed information about the exception." -// } -// }, -// "required": [ "exceptionId", "breakMode" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -// "ExceptionDetails": { -// "type": "object", -// "description": "Detailed information about an exception that has -// occurred.", "properties": { -// "message": { -// "type": "string", -// "description": "Message contained in the exception." -// }, -// "typeName": { -// "type": "string", -// "description": "Short type name of the exception object." -// }, -// "fullTypeName": { -// "type": "string", -// "description": "Fully-qualified type name of the exception object." -// }, -// "evaluateName": { -// "type": "string", -// "description": "An expression that can be evaluated in the current -// scope to obtain the exception object." -// }, -// "stackTrace": { -// "type": "string", -// "description": "Stack trace at the time the exception was thrown." -// }, -// "innerException": { -// "type": "array", -// "items": { -// "$ref": "#/definitions/ExceptionDetails" -// }, -// "description": "Details of the exception contained by this exception, -// if any." -// } -// } -// }, -void ExceptionInfoRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - const auto *arguments = request.getObject("arguments"); - llvm::json::Object body; - lldb::SBThread thread = dap.GetLLDBThread(*arguments); - if (thread.IsValid()) { - auto stopReason = thread.GetStopReason(); - if (stopReason == lldb::eStopReasonSignal) - body.try_emplace("exceptionId", "signal"); - else if (stopReason == lldb::eStopReasonBreakpoint) { - ExceptionBreakpoint *exc_bp = dap.GetExceptionBPFromStopReason(thread); - if (exc_bp) { - EmplaceSafeString(body, "exceptionId", exc_bp->GetFilter()); - EmplaceSafeString(body, "description", exc_bp->GetLabel()); - } else { - body.try_emplace("exceptionId", "exception"); - } +/// Retrieves the details of the exception that caused this event to be raised. +/// +/// Clients should only call this request if the corresponding capability +/// `supportsExceptionInfoRequest` is true. 
+llvm::Expected +ExceptionInfoRequestHandler::Run(const ExceptionInfoArguments &args) const { + + lldb::SBThread thread = dap.GetLLDBThread(args.threadId); + if (!thread.IsValid()) + return llvm::make_error( + llvm::formatv("Invalid thread id: {}", args.threadId).str()); + + ExceptionInfoResponseBody response; + response.breakMode = eExceptionBreakModeAlways; + const lldb::StopReason stop_reason = thread.GetStopReason(); + switch (stop_reason) { + case lldb::eStopReasonSignal: + response.exceptionId = "signal"; + break; + case lldb::eStopReasonBreakpoint: { + const ExceptionBreakpoint *exc_bp = + dap.GetExceptionBPFromStopReason(thread); + if (exc_bp) { + response.exceptionId = exc_bp->GetFilter(); + response.description = exc_bp->GetLabel(); } else { - body.try_emplace("exceptionId", "exception"); + response.exceptionId = "exception"; } - if (!ObjectContainsKey(body, "description")) { - char description[1024]; - if (thread.GetStopDescription(description, sizeof(description))) { - EmplaceSafeString(body, "description", description); - } + } break; + default: + response.exceptionId = "exception"; + } + + if (response.description.empty()) { + const size_t buffer_size = thread.GetStopDescription(nullptr, 0); + if (buffer_size > 0) { + std::string &buffer = response.description; + buffer.resize(buffer_size); + thread.GetStopDescription(buffer.data(), buffer.size()); } - body.try_emplace("breakMode", "always"); - auto exception = thread.GetCurrentException(); - if (exception.IsValid()) { - llvm::json::Object details; - lldb::SBStream stream; - if (exception.GetDescription(stream)) { - EmplaceSafeString(details, "message", stream.GetData()); - } + } - auto exceptionBacktrace = thread.GetCurrentExceptionBacktrace(); - if (exceptionBacktrace.IsValid()) { - lldb::SBStream stream; - exceptionBacktrace.GetDescription(stream); - for (uint32_t i = 0; i < exceptionBacktrace.GetNumFrames(); i++) { - lldb::SBFrame frame = exceptionBacktrace.GetFrameAtIndex(i); - frame.GetDescription(stream); - } - EmplaceSafeString(details, "stackTrace", stream.GetData()); - } + if (lldb::SBValue exception = thread.GetCurrentException()) { + lldb::SBStream stream; + response.details = ExceptionDetails{}; + if (exception.GetDescription(stream)) { + response.details->message = stream.GetData(); + } + + if (lldb::SBThread exception_backtrace = + thread.GetCurrentExceptionBacktrace()) { + stream.Clear(); + exception_backtrace.GetDescription(stream); - body.try_emplace("details", std::move(details)); + for (uint32_t idx = 0; idx < exception_backtrace.GetNumFrames(); idx++) { + lldb::SBFrame frame = exception_backtrace.GetFrameAtIndex(idx); + frame.GetDescription(stream); + } + response.details->stackTrace = stream.GetData(); } - // auto excInfoCount = thread.GetStopReasonDataCount(); - // for (auto i=0; i> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "exceptionInfo"; } FeatureSet GetSupportedFeatures() const override { return {protocol::eAdapterFeatureExceptionInfoRequest}; } - void operator()(const llvm::json::Object &request) const override; + llvm::Expected + Run(const protocol::ExceptionInfoArguments &args) const override; }; class InitializeRequestHandler diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index b9393356b4e01..e207aad2167d6 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ 
b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -625,4 +625,22 @@ llvm::json::Value toJSON(const ModuleSymbolsResponseBody &DGMSR) { return result; } +bool fromJSON(const json::Value &Params, ExceptionInfoArguments &Args, + json::Path Path) { + json::ObjectMapper O(Params, Path); + return O && O.map("threadId", Args.threadId); +} + +json::Value toJSON(const ExceptionInfoResponseBody &ERB) { + json::Object result{{"exceptionId", ERB.exceptionId}, + {"breakMode", ERB.breakMode}}; + + if (!ERB.description.empty()) + result.insert({"description", ERB.description.c_str()}); + if (ERB.details.has_value()) + result.insert({"details", *ERB.details}); + + return result; +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index a85a68b87014c..53e551ac2ec64 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -1039,6 +1039,28 @@ struct ModuleSymbolsResponseBody { }; llvm::json::Value toJSON(const ModuleSymbolsResponseBody &); +struct ExceptionInfoArguments { + /// Thread for which exception information should be retrieved. + lldb::tid_t threadId = LLDB_INVALID_THREAD_ID; +}; +bool fromJSON(const llvm::json::Value &, ExceptionInfoArguments &, + llvm::json::Path); + +struct ExceptionInfoResponseBody { + /// ID of the exception that was thrown. + std::string exceptionId; + + /// Descriptive text for the exception. + std::string description; + + /// Mode that caused the exception notification to be raised. + ExceptionBreakMode breakMode; + + /// Detailed information about the exception. + std::optional details; +}; +llvm::json::Value toJSON(const ExceptionInfoResponseBody &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index dc8edaadcd9bb..95007013742a0 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -1136,4 +1136,37 @@ bool fromJSON(const json::Value &Param, Variable &V, json::Path Path) { Path, /*required=*/false); } +json::Value toJSON(const ExceptionBreakMode Mode) { + switch (Mode) { + case eExceptionBreakModeNever: + return "never"; + case eExceptionBreakModeAlways: + return "always"; + case eExceptionBreakModeUnhandled: + return "unhandled"; + case eExceptionBreakModeUserUnhandled: + return "userUnhandled"; + } + llvm_unreachable("unhandled exception breakMode."); +} + +json::Value toJSON(const ExceptionDetails &ED) { + json::Object result; + + if (!ED.message.empty()) + result.insert({"message", ED.message}); + if (!ED.typeName.empty()) + result.insert({"typeName", ED.typeName}); + if (!ED.fullTypeName.empty()) + result.insert({"fullTypeName", ED.fullTypeName}); + if (!ED.evaluateName.empty()) + result.insert({"evaluateName", ED.evaluateName}); + if (!ED.stackTrace.empty()) + result.insert({"stackTrace", ED.stackTrace}); + if (!ED.innerException.empty()) + result.insert({"innerException", ED.innerException}); + + return result; +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 7077df90a85b5..6d85c74377bd3 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -1007,6 +1007,36 @@ struct Variable { llvm::json::Value toJSON(const Variable &); bool fromJSON(const llvm::json::Value &, Variable &, 
llvm::json::Path); +enum ExceptionBreakMode : unsigned { + eExceptionBreakModeNever, + eExceptionBreakModeAlways, + eExceptionBreakModeUnhandled, + eExceptionBreakModeUserUnhandled, +}; +llvm::json::Value toJSON(ExceptionBreakMode); + +struct ExceptionDetails { + /// Message contained in the exception. + std::string message; + + /// Short type name of the exception object. + std::string typeName; + + /// Fully-qualified type name of the exception object. + std::string fullTypeName; + + /// An expression that can be evaluated in the current scope to obtain the + /// exception object. + std::string evaluateName; + + /// Stack trace at the time the exception was thrown. + std::string stackTrace; + + /// Details of the exception contained by this exception, if any. + std::vector innerException; +}; +llvm::json::Value toJSON(const ExceptionDetails &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index a08414c30e6cd..434f5280a97a0 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -7,6 +7,7 @@ add_lldb_unittest(DAPTests Handler/ContinueTest.cpp JSONUtilsTest.cpp LLDBUtilsTest.cpp + ProtocolRequestsTest.cpp ProtocolTypesTest.cpp ProtocolUtilsTest.cpp TestBase.cpp diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp new file mode 100644 index 0000000000000..498195dc09325 --- /dev/null +++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp @@ -0,0 +1,69 @@ +//===-- ProtocolRequestsTest.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" +#include "TestingSupport/TestUtilities.h" +#include "llvm/Testing/Support/Error.h" +#include + +using namespace llvm; +using namespace lldb_dap::protocol; +using lldb_private::PrettyPrint; +using llvm::json::parse; + +TEST(ProtocolRequestsTest, ExceptionInfoArguments) { + llvm::Expected expected = + parse(R"({ + "threadId": 3434 + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(expected->threadId, 3434U); + + // Check required keys; + EXPECT_THAT_EXPECTED(parse(R"({})"), + FailedWithMessage("missing value at (root).threadId")); + + EXPECT_THAT_EXPECTED(parse(R"({"id": 10})"), + FailedWithMessage("missing value at (root).threadId")); +} + +TEST(ProtocolRequestsTest, ExceptionInfoResponseBody) { + ExceptionInfoResponseBody body; + body.exceptionId = "signal"; + body.breakMode = eExceptionBreakModeAlways; + + // Check required keys. + Expected expected = parse( + R"({ + "exceptionId": "signal", + "breakMode": "always" + })"); + + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body)); + + // Check optional keys. 
+ body.description = "SIGNAL SIGWINCH"; + body.breakMode = eExceptionBreakModeNever; + body.details = ExceptionDetails{}; + body.details->message = "some message"; + + Expected expected_opt = parse( + R"({ + "exceptionId": "signal", + "description": "SIGNAL SIGWINCH", + "breakMode": "never", + "details": { + "message": "some message" + } + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); +} diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index 8170abdd25bc6..6a4620a3f1e59 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -1129,3 +1129,50 @@ TEST(ProtocolTypesTest, DataBreakpointInfoArguments) { EXPECT_THAT_EXPECTED(parse(R"({"name":"data"})"), llvm::Succeeded()); } + +TEST(ProtocolTypesTest, ExceptionBreakMode) { + const std::vector> test_cases = + {{ExceptionBreakMode::eExceptionBreakModeAlways, "always"}, + {ExceptionBreakMode::eExceptionBreakModeNever, "never"}, + {ExceptionBreakMode::eExceptionBreakModeUnhandled, "unhandled"}, + {ExceptionBreakMode::eExceptionBreakModeUserUnhandled, "userUnhandled"}}; + + for (const auto [value, expected] : test_cases) { + json::Value const serialized = toJSON(value); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), expected); + } +} + +TEST(ProtocolTypesTest, ExceptionDetails) { + ExceptionDetails details; + + // Check required keys. + Expected expected = parse(R"({})"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(pp(*expected), pp(details)); + + // Check optional keys. + details.message = "SIGABRT exception"; + details.typeName = "signal"; + details.fullTypeName = "SIGABRT"; + details.evaluateName = "process handle SIGABRT"; + details.stackTrace = "some stacktrace"; + ExceptionDetails inner_details; + inner_details.message = "inner message"; + details.innerException = {std::move(inner_details)}; + + Expected expected_opt = parse(R"({ + "message": "SIGABRT exception", + "typeName": "signal", + "fullTypeName": "SIGABRT", + "evaluateName": "process handle SIGABRT", + "stackTrace": "some stacktrace", + "innerException": [{ + "message": "inner message" + }] + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(pp(*expected_opt), pp(details)); +} diff --git a/lldb/unittests/TestingSupport/TestUtilities.cpp b/lldb/unittests/TestingSupport/TestUtilities.cpp index b53822e38324b..d164c227afb9e 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.cpp +++ b/lldb/unittests/TestingSupport/TestUtilities.cpp @@ -20,6 +20,11 @@ using namespace lldb_private; extern const char *TestMainArgv0; std::once_flag TestUtilities::g_debugger_initialize_flag; + +std::string lldb_private::PrettyPrint(const llvm::json::Value &value) { + return llvm::formatv("{0:2}", value).str(); +} + std::string lldb_private::GetInputFilePath(const llvm::Twine &name) { llvm::SmallString<128> result = llvm::sys::path::parent_path(TestMainArgv0); llvm::sys::fs::make_absolute(result); diff --git a/lldb/unittests/TestingSupport/TestUtilities.h b/lldb/unittests/TestingSupport/TestUtilities.h index cc93a68a6a431..f05d176618fa0 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.h +++ b/lldb/unittests/TestingSupport/TestUtilities.h @@ -30,6 +30,10 @@ } namespace lldb_private { + +/// Returns a pretty printed json string of a `llvm::json::Value`. 
+std::string PrettyPrint(const llvm::json::Value &E); + std::string GetInputFilePath(const llvm::Twine &name); class TestUtilities { From 37fcaf5c3441564ab5051d8088f5a29701026acb Mon Sep 17 00:00:00 2001 From: NexusXe Date: Wed, 22 Oct 2025 07:09:40 -0500 Subject: [PATCH 087/530] [X86] Fix some values for Znver4 model (#161405) This PR fixes a handful of latency and uop changes between Znver3 and Znver4 that were otherwise copied from Znver3. Latency and uop values listed that matched Zen3 on uops.info were updated to those for Zen4. Includes: BSF/BSR, DIV, TZCNT, CLMUL, PCMPISTRM, VALIGN, VPERM --- llvm/lib/Target/X86/X86ScheduleZnver4.td | 110 ++++++++++-------- .../llvm-mca/X86/Znver4/resources-avx1.s | 18 +-- .../llvm-mca/X86/Znver4/resources-avx2.s | 26 ++--- .../llvm-mca/X86/Znver4/resources-avx512.s | 4 +- .../llvm-mca/X86/Znver4/resources-avx512vl.s | 14 +-- .../X86/Znver4/resources-avx512vpclmulqdq.s | 10 +- .../X86/Znver4/resources-avx512vpclmulqdqvl.s | 18 +-- .../llvm-mca/X86/Znver4/resources-bmi1.s | 16 +-- .../llvm-mca/X86/Znver4/resources-cmpxchg.s | 18 +-- .../llvm-mca/X86/Znver4/resources-pclmul.s | 10 +- .../llvm-mca/X86/Znver4/resources-sse42.s | 8 +- .../X86/Znver4/resources-vpclmulqdq.s | 10 +- .../llvm-mca/X86/Znver4/resources-x86_64.s | 98 ++++++++-------- 13 files changed, 188 insertions(+), 172 deletions(-) diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td index cc300548a50e6..ac4d31de8dbfe 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver4.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -15,7 +15,7 @@ //===----------------------------------------------------------------------===// def Znver4Model : SchedMachineModel { - // AMD SOG Zen4, 2.9.6 Dispatch + // AMD SOG Zen4, 2.9.8 Dispatch // The processor may dispatch up to 6 macro ops per cycle // into the execution engine. let IssueWidth = 6; @@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel { int VecLoadLatency = 7; // Latency of a simple store operation. int StoreLatency = 1; - // FIXME: - let HighLatency = 25; // FIXME: any better choice? + // Mean and median value for all instructions with latencies >6 + // Source: Zen4 Instruction Latencies spreadsheet (included with SOG) + let HighLatency = 13; // AMD SOG Zen4, 2.8 Optimizing Branching // The branch misprediction penalty is in the range from 11 to 18 cycles, // <...>. The common case penalty is 13 cycles. 
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[ def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>; +// values from uops.info def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> { let Latency = 2; // FIXME: not from llvm-exegesis let ReleaseAtCycles = [4]; @@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>; def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> { let Latency = 3; // FIXME: not from llvm-exegesis - let ReleaseAtCycles = [24]; - let NumMicroOps = 19; + let ReleaseAtCycles = [20]; + let NumMicroOps = 15; } def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>; def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> { - let Latency = 4; // FIXME: not from llvm-exegesis - let ReleaseAtCycles = [59]; - let NumMicroOps = 28; + let Latency = 2; // FIXME: not from llvm-exegesis + let ReleaseAtCycles = [40]; + let NumMicroOps = 26; } def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>; @@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> { let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis let ReleaseAtCycles = [1, 1, 2]; - let NumMicroOps = 5; + let NumMicroOps = 2; } def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>; @@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>; // Integer division. -// FIXME: uops for 8-bit division measures as 2. for others it's a guess. -// FIXME: latency for 8-bit division measures as 10. for others it's a guess. -defm : Zn4WriteResIntPair; -defm : Zn4WriteResIntPair; -defm : Zn4WriteResIntPair; -defm : Zn4WriteResIntPair; -defm : Zn4WriteResIntPair; -defm : Zn4WriteResIntPair; -defm : Zn4WriteResIntPair; -defm : Zn4WriteResIntPair; - -defm : Zn4WriteResIntPair; // Bit scan forward. -defm : Zn4WriteResIntPair; // Bit scan reverse. +defm : Zn4WriteResIntPair; +defm : Zn4WriteResIntPair; +defm : Zn4WriteResIntPair; +defm : Zn4WriteResIntPair; +defm : Zn4WriteResIntPair; +defm : Zn4WriteResIntPair; +defm : Zn4WriteResIntPair; +defm : Zn4WriteResIntPair; + +defm : Zn4WriteResIntPair; // Bit scan forward. +defm : Zn4WriteResIntPair; // Bit scan reverse. defm : Zn4WriteResIntPair; // Bit population count. @@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> { } def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>; -defm : Zn4WriteResIntPair; // Trailing zero count. +defm : Zn4WriteResIntPair; // Trailing zero count. 
def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> { - let Latency = 2; - let ReleaseAtCycles = [4]; - let NumMicroOps = 2; + let Latency = 1; + let ReleaseAtCycles = [1]; + let NumMicroOps = 1; } def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>; @@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> { } def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>; -def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> { - // TODO: All align instructions are expected to be of 4 cycle latency - let Latency = 4; +// 128-bit VALIGN +def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> { + let Latency = 2; let ReleaseAtCycles = [1]; let NumMicroOps = 1; } -def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri, - VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri) - >; + +// 256-bit VALIGN +def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> { + let Latency = 3; + let ReleaseAtCycles = [1]; + let NumMicroOps = 1; +} + +// 512-bit VALIGN +def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> { + let Latency = 4; + let ReleaseAtCycles = [2]; + let NumMicroOps = 1; +} + +def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>; +def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>; +def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>; + defm : Zn4WriteResYMMPair; // Vector integer ALU op, no logicals (YMM). def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> { @@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>; // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask -defm : Zn4WriteResXMMPair; +defm : Zn4WriteResXMMPair; // Packed Compare Explicit Length Strings, Return Mask -defm : Zn4WriteResXMMPair; +defm : Zn4WriteResXMMPair; // Packed Compare Implicit Length Strings, Return Index defm : Zn4WriteResXMMPair; // Packed Compare Explicit Length Strings, Return Index @@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair; // InvMixColumn defm : Zn4WriteResXMMPair; // Key Generation. // Carry-less multiplication instructions. 
-defm : Zn4WriteResXMMPair; +defm : Zn4WriteResXMMPair; // EMMS/FEMMS defm : Zn4WriteResInt; // FIXME: latency not from llvm-exegesis @@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>; def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> { - let Latency = 7; + let Latency = 4; let ReleaseAtCycles = [1]; - let NumMicroOps = 2; + let NumMicroOps = 1; } def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>; def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency); - let ReleaseAtCycles = [1, 1, 2]; - let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1); + let ReleaseAtCycles = [1, 1, 1]; + let NumMicroOps = 1; } def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>; def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> { - let Latency = 6; + let Latency = 4; let ReleaseAtCycles = [1]; - let NumMicroOps = 2; + let NumMicroOps = 1; } def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>; def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency); - let ReleaseAtCycles = [1, 1, 2]; - let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1); + let ReleaseAtCycles = [1, 1, 1]; + let NumMicroOps = 1; } def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>; def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> { - let Latency = 5; + let Latency = 4; let ReleaseAtCycles = [1]; - let NumMicroOps = 2; + let NumMicroOps = 1; } def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>; def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency); - let ReleaseAtCycles = [1, 1, 2]; - let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0); + let ReleaseAtCycles = [1, 1, 1]; + let NumMicroOps = 1; } def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>; diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s index 1ffe53366fdb0..d1df30497325b 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s @@ -1403,8 +1403,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpblendvb %xmm3, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpblendw $11, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpblendw $11, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpeqb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpcmpeqb (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpeqd %xmm0, %xmm1, %xmm2 @@ -1415,8 +1415,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpcmpeqw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 8 6 3.00 vpcmpestri $1, %xmm0, %xmm2 # CHECK-NEXT: 12 13 3.00 * vpcmpestri $1, (%rax), %xmm2 -# CHECK-NEXT: 7 6 3.00 vpcmpestrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 12 13 3.00 * vpcmpestrm $1, (%rax), %xmm2 +# CHECK-NEXT: 7 7 3.00 vpcmpestrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 12 14 3.00 * vpcmpestrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpgtb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 8 0.50 * vpcmpgtb (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.25 vpcmpgtd %xmm0, %xmm1, %xmm2 @@ -1427,8 
+1427,8 @@ vzeroupper # CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 4 2 2.00 vpcmpistri $1, %xmm0, %xmm2 # CHECK-NEXT: 4 9 2.00 * vpcmpistri $1, (%rax), %xmm2 -# CHECK-NEXT: 3 6 2.00 vpcmpistrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 4 13 2.00 * vpcmpistrm $1, (%rax), %xmm2 +# CHECK-NEXT: 3 7 2.00 vpcmpistrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 4 14 2.00 * vpcmpistrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 3 1.00 vperm2f128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 10 1.00 * vperm2f128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 1 0.50 vpermilpd $1, %xmm0, %xmm2 @@ -1749,7 +1749,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00 +# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 204.25 392.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -2126,8 +2126,8 @@ vzeroupper # CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendvb %xmm3, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpblendw $11, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendw $11, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqb %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpcmpeqb (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqd %xmm0, %xmm1, %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s index 6dc5bacde9059..6c8fac4566498 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s @@ -560,14 +560,14 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 3 1.00 vperm2i128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 8 1.00 * vperm2i128 $1, (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 5 1.00 vpermd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 2 12 2.00 * vpermd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 6 1.00 vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 3 13 2.00 * vpermpd $1, (%rax), %ymm2 -# CHECK-NEXT: 2 7 1.00 vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 3 14 2.00 * vpermps (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 2 6 1.00 vpermq $1, %ymm0, %ymm2 -# CHECK-NEXT: 2 12 2.00 * vpermq $1, (%rax), %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermpd $1, %ymm0, 
%ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 4 1.00 vpermq $1, %ymm0, %ymm2 +# CHECK-NEXT: 1 11 1.00 * vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2 # CHECK-NEXT: 1 5 0.33 * vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2 @@ -789,7 +789,7 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 132.75 92.25 36.25 80.50 80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50 +# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 128.75 92.25 36.25 80.50 80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -894,13 +894,13 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vperm2i128 $1, %ymm0, %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vperm2i128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermq $1, %ymm0, %ymm2 -# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2 +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2 # CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s index 72d7de3353346..14b8e5f36c666 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s @@ -1207,7 +1207,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 3 1.00 vaddps %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 10 1.00 * vaddps 
(%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 10 1.00 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 1 2 0.50 valignd $1, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19 # CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1} @@ -1216,7 +1216,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: 1 2 0.50 valignq $1, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax){1to8}, %zmm17, %zmm19 # CHECK-NEXT: 1 1 1.00 valignq $1, %zmm16, %zmm17, %zmm19 {%k1} diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s index 552b3e40284b9..ead609e33da4d 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s @@ -1948,7 +1948,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 3 0.50 vaddps %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vaddps (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vaddps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 1 3 0.50 valignd $1, %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19 # CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1} @@ -1957,7 +1957,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignd $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 1 4 1.00 valignd $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19 # CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -1966,7 +1966,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 1 3 0.50 valignq $1, %xmm16, %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19 # CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1} @@ -1975,7 +1975,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: 1 4 0.50 valignq $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 1 4 1.00 valignq $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 1 8 0.50 * 
valignq $1, (%rax){1to4}, %ymm17, %ymm19 # CHECK-NEXT: 1 1 0.50 valignq $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -3614,7 +3614,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1083.00 636.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00 +# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1084.00 637.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -3663,7 +3663,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to8}, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 {%k1} @@ -3681,7 +3681,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z} -# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to4}, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 {%k1} diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s index 87ba0607e71d1..d1f2a980ee444 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s @@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %zmm16, %zmm17, %zmm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %zmm17, %zmm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %zmm16, %zmm17, %zmm19 +# 
CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %zmm16, %zmm17, %zmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %zmm16, %zmm17, %zmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s index 3c80c567227c5..ea7a28027a782 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s @@ -16,10 +16,10 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm16, %xmm17, %xmm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm17, %xmm19 -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm16, %ymm17, %ymm19 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm17, %ymm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm17, %xmm19 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -48,11 +48,11 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 8.00 8.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - - +# CHECK-NEXT: - - - - - - - - 6.00 6.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm17, %xmm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), 
%xmm17, %xmm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s index f4888cf81523f..afbd566751c95 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s @@ -69,12 +69,12 @@ tzcnt (%rax), %rcx # CHECK-NEXT: 2 5 0.33 * blsrl (%rax), %ecx # CHECK-NEXT: 1 1 0.25 blsrq %rax, %rcx # CHECK-NEXT: 2 5 0.33 * blsrq (%rax), %rcx -# CHECK-NEXT: 2 2 1.00 tzcntw %ax, %cx -# CHECK-NEXT: 2 6 0.50 * tzcntw (%rax), %cx -# CHECK-NEXT: 2 2 0.50 tzcntl %eax, %ecx -# CHECK-NEXT: 2 6 0.50 * tzcntl (%rax), %ecx -# CHECK-NEXT: 2 2 0.50 tzcntq %rax, %rcx -# CHECK-NEXT: 2 6 0.50 * tzcntq (%rax), %rcx +# CHECK-NEXT: 1 1 0.25 tzcntw %ax, %cx +# CHECK-NEXT: 1 5 0.50 * tzcntw (%rax), %cx +# CHECK-NEXT: 1 1 0.50 tzcntl %eax, %ecx +# CHECK-NEXT: 1 5 0.50 * tzcntl (%rax), %ecx +# CHECK-NEXT: 1 1 0.50 tzcntq %rax, %rcx +# CHECK-NEXT: 1 5 0.50 * tzcntq (%rax), %rcx # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -103,7 +103,7 @@ tzcnt (%rax), %rcx # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 4.33 4.33 4.33 5.00 9.50 9.50 5.00 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - - +# CHECK-NEXT: 4.33 4.33 4.33 4.25 8.75 8.75 4.25 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -127,7 +127,7 @@ tzcnt (%rax), %rcx # CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrl (%rax), %ecx # CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - blsrq %rax, %rcx # CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrq (%rax), %rcx -# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - - - - - - - - - - tzcntw %ax, %cx +# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - tzcntw %ax, %cx # CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntw (%rax), %cx # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - - - - - - - - - - - tzcntl %eax, %ecx # CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntl (%rax), %ecx diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s index 64feeaf6d4ad8..26a42fd9964b5 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s @@ -15,10 +15,10 @@ lock cmpxchg16b (%rax) # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 19 3 6.00 * * cmpxchg8b (%rax) -# CHECK-NEXT: 28 4 14.75 * * cmpxchg16b (%rax) -# CHECK-NEXT: 19 3 6.00 * * lock cmpxchg8b (%rax) -# CHECK-NEXT: 28 4 14.75 * * lock cmpxchg16b (%rax) +# CHECK-NEXT: 15 3 5.00 * * cmpxchg8b (%rax) +# CHECK-NEXT: 26 2 10.00 * * cmpxchg16b (%rax) +# CHECK-NEXT: 15 3 5.00 * * lock cmpxchg8b (%rax) +# CHECK-NEXT: 26 2 10.00 * * lock cmpxchg16b (%rax) # 
CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -47,11 +47,11 @@ lock cmpxchg16b (%rax) # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - 41.50 41.50 41.50 41.50 - - - - - - - - - - - - - - - - +# CHECK-NEXT: - - - 30.00 30.00 30.00 30.00 - - - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - 6.00 6.00 6.00 6.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax) -# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - cmpxchg16b (%rax) -# CHECK-NEXT: - - - 6.00 6.00 6.00 6.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax) -# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax) +# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax) +# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - cmpxchg16b (%rax) +# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax) +# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax) diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s index a36fb2aabe486..fc2bc8e21bf14 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s @@ -13,8 +13,8 @@ pclmulqdq $11, (%rax), %xmm2 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 pclmulqdq $11, %xmm0, %xmm2 -# CHECK-NEXT: 4 11 2.00 * pclmulqdq $11, (%rax), %xmm2 +# CHECK-NEXT: 4 4 1.50 pclmulqdq $11, %xmm0, %xmm2 +# CHECK-NEXT: 4 11 1.50 * pclmulqdq $11, (%rax), %xmm2 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ pclmulqdq $11, (%rax), %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s index 015d37e3e6296..ae608354e2a6f 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s @@ -52,12 +52,12 @@ pcmpgtq (%rax), %xmm2 # CHECK-NEXT: 1 7 1.00 * crc32q (%rax), %rcx # CHECK-NEXT: 8 6 3.00 pcmpestri $1, %xmm0, %xmm2 # CHECK-NEXT: 12 13 3.00 * 
pcmpestri $1, (%rax), %xmm2 -# CHECK-NEXT: 7 6 3.00 pcmpestrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 12 13 3.00 * pcmpestrm $1, (%rax), %xmm2 +# CHECK-NEXT: 7 7 3.00 pcmpestrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 12 14 3.00 * pcmpestrm $1, (%rax), %xmm2 # CHECK-NEXT: 4 2 2.00 pcmpistri $1, %xmm0, %xmm2 # CHECK-NEXT: 4 9 2.00 * pcmpistri $1, (%rax), %xmm2 -# CHECK-NEXT: 3 6 2.00 pcmpistrm $1, %xmm0, %xmm2 -# CHECK-NEXT: 4 13 2.00 * pcmpistrm $1, (%rax), %xmm2 +# CHECK-NEXT: 3 7 2.00 pcmpistrm $1, %xmm0, %xmm2 +# CHECK-NEXT: 4 14 2.00 * pcmpistrm $1, (%rax), %xmm2 # CHECK-NEXT: 1 1 0.25 pcmpgtq %xmm0, %xmm2 # CHECK-NEXT: 1 8 0.50 * pcmpgtq (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s index 55a36d0f1ea09..dca470338b5a4 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s @@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm1, %ymm3 +# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK: Resources: # CHECK-NEXT: [0] - Zn4AGU0 @@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - +# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s index 9c5b4e45896de..886d9c6930418 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s @@ -1173,18 +1173,18 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 6 0.67 * * andq %rsi, (%rax) # CHECK-NEXT: 1 6 0.67 * * lock andq %rsi, (%rax) # CHECK-NEXT: 1 5 0.33 * andq (%rax), %rdi -# CHECK-NEXT: 6 1 1.00 bsfw %si, %di -# CHECK-NEXT: 6 1 1.00 bsrw %si, %di -# CHECK-NEXT: 7 5 1.00 * bsfw (%rax), %di -# CHECK-NEXT: 7 5 1.00 * bsrw (%rax), %di -# CHECK-NEXT: 6 1 1.00 bsfl %esi, %edi -# CHECK-NEXT: 6 1 1.00 bsrl %esi, %edi -# CHECK-NEXT: 7 5 1.00 * bsfl (%rax), %edi -# CHECK-NEXT: 7 5 1.00 * bsrl (%rax), %edi -# CHECK-NEXT: 6 1 1.00 bsfq %rsi, %rdi -# CHECK-NEXT: 6 1 1.00 bsrq %rsi, %rdi -# CHECK-NEXT: 7 5 1.00 * bsfq (%rax), %rdi -# CHECK-NEXT: 7 5 1.00 * bsrq (%rax), %rdi +# CHECK-NEXT: 1 1 1.00 bsfw %si, %di +# CHECK-NEXT: 1 1 1.00 bsrw %si, %di +# CHECK-NEXT: 2 5 1.00 * bsfw (%rax), %di +# 
CHECK-NEXT: 2 5 1.00 * bsrw (%rax), %di +# CHECK-NEXT: 1 1 1.00 bsfl %esi, %edi +# CHECK-NEXT: 1 1 1.00 bsrl %esi, %edi +# CHECK-NEXT: 2 5 1.00 * bsfl (%rax), %edi +# CHECK-NEXT: 2 5 1.00 * bsrl (%rax), %edi +# CHECK-NEXT: 1 1 1.00 bsfq %rsi, %rdi +# CHECK-NEXT: 1 1 1.00 bsrq %rsi, %rdi +# CHECK-NEXT: 2 5 1.00 * bsfq (%rax), %rdi +# CHECK-NEXT: 2 5 1.00 * bsrq (%rax), %rdi # CHECK-NEXT: 1 1 0.25 bswapl %eax # CHECK-NEXT: 1 1 0.25 bswapq %rax # CHECK-NEXT: 1 1 0.50 btw %si, %di @@ -1321,23 +1321,23 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 1 0.25 decq %rdi # CHECK-NEXT: 1 6 0.67 * * decq (%rax) # CHECK-NEXT: 1 6 0.67 * * lock decq (%rax) -# CHECK-NEXT: 2 10 10.00 U divb %dil -# CHECK-NEXT: 2 14 10.00 * U divb (%rax) -# CHECK-NEXT: 2 11 11.00 U divw %si -# CHECK-NEXT: 2 15 11.00 * U divw (%rax) -# CHECK-NEXT: 2 13 13.00 U divl %edx -# CHECK-NEXT: 2 17 13.00 * U divl (%rax) -# CHECK-NEXT: 2 17 17.00 U divq %rcx -# CHECK-NEXT: 2 21 17.00 * U divq (%rax) +# CHECK-NEXT: 2 9 9.00 U divb %dil +# CHECK-NEXT: 2 13 9.00 * U divb (%rax) +# CHECK-NEXT: 2 10 10.00 U divw %si +# CHECK-NEXT: 2 14 10.00 * U divw (%rax) +# CHECK-NEXT: 2 12 12.00 U divl %edx +# CHECK-NEXT: 2 16 12.00 * U divl (%rax) +# CHECK-NEXT: 2 18 18.00 U divq %rcx +# CHECK-NEXT: 2 22 18.00 * U divq (%rax) # CHECK-NEXT: 100 100 25.00 U enter $7, $4095 -# CHECK-NEXT: 2 10 10.00 U idivb %dil -# CHECK-NEXT: 2 14 10.00 * U idivb (%rax) -# CHECK-NEXT: 2 11 11.00 U idivw %si -# CHECK-NEXT: 2 15 11.00 * U idivw (%rax) -# CHECK-NEXT: 2 13 13.00 U idivl %edx -# CHECK-NEXT: 2 17 13.00 * U idivl (%rax) -# CHECK-NEXT: 2 17 17.00 U idivq %rcx -# CHECK-NEXT: 2 21 17.00 * U idivq (%rax) +# CHECK-NEXT: 2 9 9.00 U idivb %dil +# CHECK-NEXT: 2 13 9.00 * U idivb (%rax) +# CHECK-NEXT: 2 10 10.00 U idivw %si +# CHECK-NEXT: 2 14 10.00 * U idivw (%rax) +# CHECK-NEXT: 2 12 12.00 U idivl %edx +# CHECK-NEXT: 2 16 12.00 * U idivl (%rax) +# CHECK-NEXT: 2 18 18.00 U idivq %rcx +# CHECK-NEXT: 2 22 18.00 * U idivq (%rax) # CHECK-NEXT: 1 3 3.00 imulb %dil # CHECK-NEXT: 1 7 3.00 * imulb (%rax) # CHECK-NEXT: 3 3 3.00 imulw %di @@ -1891,12 +1891,12 @@ xorq (%rax), %rdi # CHECK-NEXT: 1 5 0.67 * * xaddq %rax, (%rbx) # CHECK-NEXT: 1 5 0.67 * * lock xaddq %rax, (%rbx) # CHECK-NEXT: 2 1 0.50 xchgb %bl, %cl -# CHECK-NEXT: 5 7 0.50 * * xchgb %bl, (%rbx) -# CHECK-NEXT: 5 7 0.50 * * lock xchgb %bl, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * xchgb %bl, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * lock xchgb %bl, (%rbx) # CHECK-NEXT: 2 1 0.50 xchgw %bx, %ax # CHECK-NEXT: 2 1 0.50 xchgw %bx, %cx -# CHECK-NEXT: 5 7 0.50 * * xchgw %ax, (%rbx) -# CHECK-NEXT: 5 7 0.50 * * lock xchgw %ax, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * xchgw %ax, (%rbx) +# CHECK-NEXT: 2 7 0.50 * * lock xchgw %ax, (%rbx) # CHECK-NEXT: 2 0 0.33 xchgl %ebx, %eax # CHECK-NEXT: 2 0 0.33 xchgl %ebx, %ecx # CHECK-NEXT: 2 6 0.50 * * xchgl %eax, (%rbx) @@ -1975,7 +1975,7 @@ xorq (%rax), %rdi # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 259.00 259.00 259.00 1733.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00 +# CHECK-NEXT: 259.00 259.00 259.00 1725.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ 
-2266,23 +2266,23 @@ xorq (%rax), %rdi # CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - decq %rdi # CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 decq (%rax) # CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 lock decq (%rax) -# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divb %dil -# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax) -# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - divw %si -# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax) -# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - divl %edx -# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax) -# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - divq %rcx -# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax) +# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - divb %dil +# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax) +# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divw %si +# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax) +# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - divl %edx +# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax) +# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - divq %rcx +# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax) # CHECK-NEXT: - - - 25.00 25.00 25.00 25.00 - - - - - - - - - - - - - - - - enter $7, $4095 -# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivb %dil -# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax) -# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - idivw %si -# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax) -# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - idivl %edx -# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax) -# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - idivq %rcx -# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax) +# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - idivb %dil +# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax) +# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivw %si +# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax) +# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - idivl %edx +# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax) +# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - idivq %rcx +# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax) # CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - - - - - - imulb %dil # CHECK-NEXT: 0.33 0.33 0.33 - 3.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - imulb (%rax) # CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - 
- - - - - imulw %di From becf84790126ce83ba36eaddc06a0a0a46005048 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 20:14:21 +0800 Subject: [PATCH 088/530] [AArch64][test] Remove unsafe-fp-math uses (NFC) (#164606) Post cleanup for #164534 --- llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir | 2 +- llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir | 2 +- llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir | 2 +- .../AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll | 2 +- llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll | 4 ++-- llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll | 4 ++-- llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll | 4 ++-- llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir | 4 ++-- llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-rounding.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll | 4 ++-- llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll | 2 +- llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll | 2 +- llvm/test/CodeGen/AArch64/consthoist-gep.ll | 2 +- llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll | 4 ++-- llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll | 2 +- llvm/test/CodeGen/AArch64/recp-fastmath.ll | 4 ++-- llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll | 2 +- llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll | 4 ++-- llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll | 2 +- llvm/test/CodeGen/AArch64/stack_guard_remat.ll | 2 +- llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll | 2 +- llvm/test/CodeGen/AArch64/wineh-frame5.mir | 4 ++-- llvm/test/CodeGen/AArch64/wineh-frame6.mir | 4 ++-- llvm/test/CodeGen/AArch64/wineh-frame7.mir | 4 ++-- llvm/test/CodeGen/AArch64/wineh-frame8.mir | 2 +- llvm/test/CodeGen/AArch64/wineh5.mir | 4 ++-- llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir | 4 ++-- 32 files changed, 45 insertions(+), 45 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir index 97a0417042e1f..b040ff2932ce8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir @@ -56,7 +56,7 @@ } - attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } attributes #2 = { optsize } 
attributes #3 = { minsize } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir index fc4fbacc33bcf..f24aeaed1d8bc 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir @@ -47,7 +47,7 @@ ret void } - attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir index b06cadfa2e3f4..e4d2ca32468f6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -50,7 +50,7 @@ declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } ... 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll index 0c1776e61a4d4..6e3682a02eaff 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll @@ -37,7 +37,7 @@ for.body: ; preds = %for.body, %entry ; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind readnone } !llvm.dbg.cu = !{!0} diff --git a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll index f2ed57ead074e..353e818f09683 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll @@ -325,7 +325,7 @@ entry: declare void @hhh(double, double) -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll index 7e97116d9d022..8da0e111357d0 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -694,8 +694,8 @@ bb1: ; CHECK: .[[LABEL]]: ; CHECK: ret -attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } !1 = !{!2, !2, i64 0} !2 = !{!"int", !3, i64 0} diff --git a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll index 296435adc8de5..937bfe4ff94bf 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll +++ 
b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll @@ -519,8 +519,8 @@ while.cond: br label %while.cond } -attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir index 45fa2be5b59de..c05d661da1b92 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir +++ b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir @@ -79,8 +79,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #3 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind readnone speculatable } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll index 4e86f5258ef19..071344da03f47 100644 --- a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll +++ b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll @@ -47,6 +47,6 @@ declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) #1 ; Function Attrs: nounwind readnone declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) #1 -attributes #0 = { nounwind readnone 
"disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll index 9b3d53911ba06..0ddcdccca4ec4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll @@ -8,5 +8,5 @@ define float @mul_add(float %a, float %b, float %c) local_unnamed_addr #0 { ret float %add } -attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll index d2ce7e6cf0320..41f57bfaa6b8b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll @@ -84,7 +84,7 @@ bb3: ; preds = %bb3, %bb ; Function Attrs: nounwind readnone declare i64 @llvm.objectsize.i64.p0(ptr, i1) #1 -attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind readnone } !1 = !{!2, !2, i64 0} diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll index 4cdc6cc117bb8..c6cf2403135a7 100644 --- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll +++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll @@ -107,7 +107,7 @@ define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) { ; Function Attrs: nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind 
"less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll index 82b34efab6834..bb1a6b02a63de 100644 --- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll +++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll @@ -108,5 +108,5 @@ for.end: ; preds = %for.cond ; Function Attrs: nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-rounding.ll b/llvm/test/CodeGen/AArch64/arm64-rounding.ll index d487aabccc4f1..3ce35bfc4537c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-rounding.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rounding.ll @@ -201,4 +201,4 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind "unsafe-fp-math"="true" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll index db65fdde0ae25..1486b3a8b9cff 100644 --- a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll +++ b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll @@ -36,6 +36,6 @@ for.end705.i: ; preds = %for.body453.i declare void @f() local_unnamed_addr #1 -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" } attributes #2 = { nounwind } diff 
--git a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll index fc5935082ff07..593d629e17883 100644 --- a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll @@ -18,7 +18,7 @@ entry: ret i32 %1 } -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll index 2e3b99fd8ef34..c4bf7d22f7a23 100644 --- a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll +++ b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll @@ -61,4 +61,4 @@ declare dso_local void @e(...) local_unnamed_addr #0 declare dso_local i64 @llvm.aarch64.space(i32, i64) local_unnamed_addr #0 -attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll index 031ee353c0e8e..7d2aaecbda67e 100644 --- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll +++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll @@ -108,7 +108,7 @@ bb19: ; preds = %bb3, %bb ret void } -attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll index 61df3965ca1be..e5614814bf021 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll +++ 
b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll @@ -32,5 +32,5 @@ main_: declare i32 @printf(ptr, ...) #1 -attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll index c2ef2fa16a9a2..00a8c30229e57 100644 --- a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll +++ b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll @@ -74,7 +74,7 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !10 } -attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/CodeGen/AArch64/recp-fastmath.ll b/llvm/test/CodeGen/AArch64/recp-fastmath.ll index 9f00621eff6b4..fa1da33007667 100644 --- a/llvm/test/CodeGen/AArch64/recp-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/recp-fastmath.ll @@ -164,5 +164,5 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 { ; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } -attributes #0 = { nounwind "unsafe-fp-math"="true" } -attributes #1 = { nounwind "unsafe-fp-math"="true" "reciprocal-estimates"="div,vec-div" } +attributes #0 = { nounwind } +attributes #1 = { nounwind "reciprocal-estimates"="div,vec-div" } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll index 66ac04e52394a..22abb8ccceb19 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll @@ -64,6 +64,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 ; Function Attrs: argmemonly nounwind willreturn declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 -attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" 
"frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind willreturn } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll index e5725bcf66fdf..d689a76ec7708 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll @@ -158,10 +158,10 @@ eh.resume: ; preds = %lpad.body resume { ptr, i32 } %eh.lpad-body } -attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind willreturn } attributes #2 = { nounwind readnone } -attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #4 = { nounwind } attributes #5 = { noreturn } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll index 91adf8257c992..74836224052b0 100644 --- 
a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll @@ -77,6 +77,6 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #1 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 -attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll index 523eda6149eef..e41d82cd2cf17 100644 --- a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll +++ b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll @@ -54,7 +54,7 @@ declare void @foo3(ptr) ; Function Attrs: nounwind declare void @llvm.lifetime.end.p0(i64, ptr nocapture) -attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } ;--- pic.ll !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll index 623ea22000c83..89b3b8906f7d2 100644 --- a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll +++ b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll @@ -24,7 +24,7 @@ define void @fn(ptr %argA, ptr %argB, ptr %a) #0 align 2 { ; CHECK: ret -attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/AArch64/wineh-frame5.mir b/llvm/test/CodeGen/AArch64/wineh-frame5.mir index 97c5c85c1279d..32580f4f3f41c 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame5.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame5.mir @@ -64,9 +64,9 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #3 - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" 
"disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } - attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #3 = { nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/wineh-frame6.mir b/llvm/test/CodeGen/AArch64/wineh-frame6.mir index 5ba7842642d47..d76fae1aad3dc 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame6.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame6.mir @@ -47,8 +47,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/wineh-frame7.mir b/llvm/test/CodeGen/AArch64/wineh-frame7.mir index 159909814ff33..d4e71d94c7d73 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame7.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame7.mir @@ -71,8 +71,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind } ... diff --git a/llvm/test/CodeGen/AArch64/wineh-frame8.mir b/llvm/test/CodeGen/AArch64/wineh-frame8.mir index 9de99ac25bb6c..56f92f230fcf6 100644 --- a/llvm/test/CodeGen/AArch64/wineh-frame8.mir +++ b/llvm/test/CodeGen/AArch64/wineh-frame8.mir @@ -29,7 +29,7 @@ ret i32 %add } - attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } ... 
--- diff --git a/llvm/test/CodeGen/AArch64/wineh5.mir b/llvm/test/CodeGen/AArch64/wineh5.mir index efdd4b00c90b6..1c09b7875d9f5 100644 --- a/llvm/test/CodeGen/AArch64/wineh5.mir +++ b/llvm/test/CodeGen/AArch64/wineh5.mir @@ -73,8 +73,8 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #2 - attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir index 2f631c217e8eb..52d0dffc20712 100644 --- a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir +++ b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir @@ -56,9 +56,9 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(ptr, ptr) #3 - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } - attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" 
"target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } attributes #3 = { nounwind } !llvm.module.flags = !{!0} From 57412c3485c5614348015f37ce69be0f378fcad9 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 22 Oct 2025 13:20:26 +0100 Subject: [PATCH 089/530] [GlobalISel] Update the documentation of abd. (#164594) The abd nodes do not perform the same as abs(x-y), although they are often mistaken to do so. They extend into a larger bit size before performing the sub / abs and so produce different results. Update the description of the instruction to avoid misunderstandings. --- llvm/docs/GlobalISel/GenericOpcode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index b055327466739..661a11537cf57 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -504,7 +504,7 @@ undefined. G_ABDS, G_ABDU ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Compute the absolute difference (signed and unsigned), e.g. abs(x-y). +Compute the absolute difference (signed and unsigned), e.g. trunc(abs(ext(x)-ext(y)). .. code-block:: none From c636a39e33594f493e3a4e831ddee952cd9b5cb6 Mon Sep 17 00:00:00 2001 From: Nathan Corbyn Date: Wed, 22 Oct 2025 13:36:39 +0100 Subject: [PATCH 090/530] [Matrix] Add tests identifying GVN and DSE opportunities for matrix store / load intrinsics (#163573) This patch adds several tests identifying potential opportunities for eliminating dead stores and redundant loads when using the `llvm.matrix.column.major.store.*` and `llvm.matrix.column.major.load.*` intrinsics. PR: https://github.com/llvm/llvm-project/pull/163573 --- .../Analysis/BasicAA/matrix-intrinsics.ll | 30 ++ .../DeadStoreElimination/matrix-intrinsics.ll | 338 ++++++++++++++++++ llvm/test/Transforms/GVN/matrix-intrinsics.ll | 136 +++++++ 3 files changed, 504 insertions(+) create mode 100644 llvm/test/Analysis/BasicAA/matrix-intrinsics.ll create mode 100644 llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll create mode 100644 llvm/test/Transforms/GVN/matrix-intrinsics.ll diff --git a/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll new file mode 100644 index 0000000000000..1de8ab5d3e590 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +; BasicAA should prove that loads from sufficiently large static offsets +; don't overlap with matrix loads with a statically known size. 
+ +define <8 x double> @non_overlapping_strided_load(ptr %src) { +; CHECK-LABEL: Function: non_overlapping_strided_load: +; Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) +; Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 12 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) + ret <8 x double> %l +} + +define <8 x double> @overlapping_strided_load(ptr %src) { +; CHECK-LABEL: Function: overlapping_strided_load: +; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) +; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 11 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) + ret <8 x double> %l +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll new file mode 100644 index 0000000000000..ae3c7464656df --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll @@ -0,0 +1,338 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=dse -S %s | FileCheck %s + +define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = load double, ptr %src + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @live_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_unstrided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> 
@llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %dst + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_strided_store(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_strided_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 200, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 200, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @live_strided_store(ptr %ptr) { +; CHECK-LABEL: define void @live_strided_store( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 200, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 100, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 200, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, 
i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %src + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @live_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %dst + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_dynamically_strided_store(ptr noalias %src, ptr noalias %dst, i32 %stride) { +; CHECK-LABEL: define void @dead_dynamically_strided_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @live_dynamically_strided_store(ptr %ptr, i32 %stride) { +; CHECK-LABEL: define void @live_dynamically_strided_store( +; CHECK-SAME: ptr [[PTR:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2) + %l = call <8 x double> 
@llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 %stride, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) { +; CHECK-LABEL: define void @dead_dynamically_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8 +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %src + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @live_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) { +; CHECK-LABEL: define void @live_dynamically_strided_store_non_matrix_load( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8 +; CHECK-NEXT: ret void +; +entry: + %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + %l.2 = load double, ptr %dst + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_unstrided_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 
2) + ret void +} + +define void @live_unstrided_store(ptr %ptr) { +; CHECK-LABEL: define void @live_unstrided_store( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_non_matrix_store(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_non_matrix_store( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 6 +; CHECK-NEXT: store double 4.200000e+01, ptr [[DST_OFFSET]], align 8 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %dst.offset = getelementptr inbounds double, ptr %src, i32 6 + store double 42.0, ptr %dst.offset + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @live_non_matrix_store(ptr %ptr) { +; CHECK-LABEL: define void @live_non_matrix_store( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[PTR_OFFSET:%.*]] = getelementptr inbounds double, ptr [[PTR]], i32 6 +; CHECK-NEXT: store double 4.200000e+01, ptr [[PTR_OFFSET]], align 8 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %ptr.offset = getelementptr inbounds double, ptr %ptr, i32 6 + store double 42.0, ptr %ptr.offset + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2) + ret void +} + +define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 
64 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + store <8 x double> zeroinitializer, ptr %dst + ret void +} + +define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <16 x double> zeroinitializer, ptr [[DST]], align 128 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + store <16 x double> zeroinitializer, ptr %dst + ret void +} + +define void @live_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_unstrided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + store <4 x double> zeroinitializer, ptr %dst + ret void +} + +define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + store <8 x double> zeroinitializer, ptr %dst + ret void +} + +define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @dead_matrix_store_dimension_change( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void 
@llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3) + ret void +} + +define void @live_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define void @live_matrix_store_dimension_change( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3) + call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) + ret void +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll new file mode 100644 index 0000000000000..78dbfe1ef6bd8 --- /dev/null +++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=gvn -S %s | FileCheck %s + +define void @redundant_unstrided_load(ptr %src) { +; CHECK-LABEL: define void @redundant_unstrided_load( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_2]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 8 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call 
void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @redundant_unstrided_load_non_matrix_store(ptr %src) { +; CHECK-LABEL: define void @redundant_unstrided_load_non_matrix_store( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8 +; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_2]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 1 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + store double 42.0, ptr %src + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @redundant_strided_load(ptr %src) { +; CHECK-LABEL: define void @redundant_strided_load( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_2]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 16 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) + %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void + +} + +define void @redundant_strided_load_non_matrix_store(ptr %src) { +; CHECK-LABEL: define void @redundant_strided_load_non_matrix_store( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16 +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8 +; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2) +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_2]]) +; CHECK-NEXT: ret void +; +entry: + %src.offset = getelementptr inbounds double, ptr %src, i32 16 + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + store double 42.0, ptr %src + %l.2 = call <8 x double> 
@llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.2) + ret void +} + +define void @repeat_load_dimension_change_project(ptr %src) { +; CHECK-LABEL: define void @repeat_load_dimension_change_project( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_3]]) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) + %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.3) + ret void +} + +define void @repeat_load_dimension_change_shuffle(ptr %src) { +; CHECK-LABEL: define void @repeat_load_dimension_change_shuffle( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2) +; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3) +; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> +; CHECK-NEXT: call void @use(<8 x double> [[L]]) +; CHECK-NEXT: call void @use(<8 x double> [[L_3]]) +; CHECK-NEXT: ret void +; +entry: + %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) + %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) + %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> + call void @use(<8 x double> %l) + call void @use(<8 x double> %l.3) + ret void +} + +declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) +declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) +declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32) +declare void @use(<8 x double>) From f7fb52aea0b90a2fa76f162e8cbd481c5e1bd91b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 22 Oct 2025 14:58:06 +0200 Subject: [PATCH 091/530] [Clang] Move AllocToken frontend options to LangOptions (#163635) Move the `AllocTokenMax` from `CodeGenOptions` and introduces a new `AllocTokenMode` to `LangOptions`. Note, `-falloc-token-mode=` deliberately remains an internal experimental option. This refactoring is necessary because these options influence frontend behavior, specifically constexpr evaluation of `__builtin_infer_alloc_token`. Placing them in `LangOptions` makes them accessible during semantic analysis, which occurs before codegen. 
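For illustration, a combined invocation exercising both options after this change might look as follows (a sketch based on the driver test updated below; the source file name is a placeholder, and only the flags shown in this patch are assumed to exist):

  clang --target=x86_64-linux-gnu -fsanitize=alloc-token \
        -falloc-token-max=42 -Xclang -falloc-token-mode=typehash -c test.c

Because both settings now live in LangOptions, they are visible to semantic analysis (for example constexpr evaluation of __builtin_infer_alloc_token) as well as to the AllocToken pass at codegen time.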
--- clang/docs/AllocToken.rst | 4 +- clang/include/clang/Basic/CodeGenOptions.h | 4 -- clang/include/clang/Basic/LangOptions.h | 8 ++++ clang/include/clang/Driver/Options.td | 4 ++ clang/lib/CodeGen/BackendUtil.cpp | 9 ++-- clang/lib/Frontend/CompilerInvocation.cpp | 53 ++++++++++++++++------ clang/test/Driver/fsanitize-alloc-token.c | 11 +++++ 7 files changed, 71 insertions(+), 22 deletions(-) diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst index bda84669456ce..b65e18ccfa967 100644 --- a/clang/docs/AllocToken.rst +++ b/clang/docs/AllocToken.rst @@ -37,8 +37,8 @@ The default mode to calculate tokens is: pointers. Other token ID assignment modes are supported, but they may be subject to -change or removal. These may (experimentally) be selected with ``-mllvm --alloc-token-mode=``: +change or removal. These may (experimentally) be selected with ``-Xclang +-falloc-token-mode=``: * ``typehash``: This mode assigns a token ID based on the hash of the allocated type's name. diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index cae06c3c9495a..5d5cf250b56b9 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -447,10 +447,6 @@ class CodeGenOptions : public CodeGenOptionsBase { std::optional AllowRuntimeCheckSkipHotCutoff; - /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use - /// pass default). - std::optional AllocTokenMax; - /// List of backend command-line options for -fembed-bitcode. std::vector CmdArgs; diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 260a7537edb9d..8aa89d8c8c807 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -25,6 +25,7 @@ #include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Support/AllocToken.h" #include "llvm/TargetParser/Triple.h" #include #include @@ -565,6 +566,13 @@ class LangOptions : public LangOptionsBase { bool AtomicFineGrainedMemory = false; bool AtomicIgnoreDenormalMode = false; + /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use + /// target default). + std::optional AllocTokenMax; + + /// The allocation token mode. 
+ std::optional AllocTokenMode; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7ae153deb9a55..0c9584f1b479f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2751,6 +2751,10 @@ def falloc_token_max_EQ : Joined<["-"], "falloc-token-max=">, MetaVarName<"">, HelpText<"Limit to maximum N allocation tokens (0 = no max)">; +def falloc_token_mode_EQ : Joined<["-"], "falloc-token-mode=">, + Group, Visibility<[CC1Option]>, + HelpText<"Set the allocation token mode (experimental)">; + def fallow_runtime_check_skip_hot_cutoff_EQ : Joined<["-"], "fallow-runtime-check-skip-hot-cutoff=">, Group, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index ecfbcb5970092..c423c4bebcac8 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -234,9 +234,12 @@ class EmitAssemblyHelper { }; } // namespace -static AllocTokenOptions getAllocTokenOptions(const CodeGenOptions &CGOpts) { +static AllocTokenOptions getAllocTokenOptions(const LangOptions &LangOpts, + const CodeGenOptions &CGOpts) { AllocTokenOptions Opts; - Opts.MaxTokens = CGOpts.AllocTokenMax; + if (LangOpts.AllocTokenMode) + Opts.Mode = *LangOpts.AllocTokenMode; + Opts.MaxTokens = LangOpts.AllocTokenMax; Opts.Extended = CGOpts.SanitizeAllocTokenExtended; Opts.FastABI = CGOpts.SanitizeAllocTokenFastABI; return Opts; @@ -802,7 +805,7 @@ static void addSanitizers(const Triple &TargetTriple, // memory allocation function detection. MPM.addPass(InferFunctionAttrsPass()); } - MPM.addPass(AllocTokenPass(getAllocTokenOptions(CodeGenOpts))); + MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts))); } }; if (ClSanitizeOnOptimizerEarlyEP) { diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 5bd15f5d4ca31..d2cb751c9f77c 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1833,10 +1833,6 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, serializeSanitizerKinds(Opts.SanitizeAnnotateDebugInfo)) GenerateArg(Consumer, OPT_fsanitize_annotate_debug_info_EQ, Sanitizer); - if (Opts.AllocTokenMax) - GenerateArg(Consumer, OPT_falloc_token_max_EQ, - std::to_string(*Opts.AllocTokenMax)); - if (!Opts.EmitVersionIdentMetadata) GenerateArg(Consumer, OPT_Qn); @@ -2350,15 +2346,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, } } - if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_max_EQ)) { - StringRef S = Arg->getValue(); - uint64_t Value = 0; - if (S.getAsInteger(0, Value)) - Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S; - else - Opts.AllocTokenMax = Value; - } - Opts.EmitVersionIdentMetadata = Args.hasFlag(OPT_Qy, OPT_Qn, true); if (!LangOpts->CUDAIsDevice) @@ -3966,6 +3953,29 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, if (!Opts.RandstructSeed.empty()) GenerateArg(Consumer, OPT_frandomize_layout_seed_EQ, Opts.RandstructSeed); + + if (Opts.AllocTokenMax) + GenerateArg(Consumer, OPT_falloc_token_max_EQ, + std::to_string(*Opts.AllocTokenMax)); + + if (Opts.AllocTokenMode) { + StringRef S; + switch (*Opts.AllocTokenMode) { + case llvm::AllocTokenMode::Increment: + S = "increment"; + break; + case llvm::AllocTokenMode::Random: + S = "random"; + break; + case 
llvm::AllocTokenMode::TypeHash: + S = "typehash"; + break; + case llvm::AllocTokenMode::TypeHashPointerSplit: + S = "typehashpointersplit"; + break; + } + GenerateArg(Consumer, OPT_falloc_token_mode_EQ, S); + } } bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, @@ -4544,6 +4554,23 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, if (const Arg *A = Args.getLastArg(OPT_frandomize_layout_seed_EQ)) Opts.RandstructSeed = A->getValue(0); + if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_max_EQ)) { + StringRef S = Arg->getValue(); + uint64_t Value = 0; + if (S.getAsInteger(0, Value)) + Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S; + else + Opts.AllocTokenMax = Value; + } + + if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_mode_EQ)) { + StringRef S = Arg->getValue(); + if (auto Mode = getAllocTokenModeFromString(S)) + Opts.AllocTokenMode = Mode; + else + Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S; + } + // Validate options for HLSL if (Opts.HLSL) { // TODO: Revisit restricting SPIR-V to logical once we've figured out how to diff --git a/clang/test/Driver/fsanitize-alloc-token.c b/clang/test/Driver/fsanitize-alloc-token.c index 2964f60c4f26f..6d8bda16dfb96 100644 --- a/clang/test/Driver/fsanitize-alloc-token.c +++ b/clang/test/Driver/fsanitize-alloc-token.c @@ -41,3 +41,14 @@ // CHECK-MAX: "-falloc-token-max=42" // RUN: not %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -falloc-token-max=-1 %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-MAX %s // CHECK-INVALID-MAX: error: invalid value + +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=increment %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-INCREMENT %s +// CHECK-MODE-INCREMENT: "-falloc-token-mode=increment" +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=random %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-RANDOM %s +// CHECK-MODE-RANDOM: "-falloc-token-mode=random" +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=typehash %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-TYPEHASH %s +// CHECK-MODE-TYPEHASH: "-falloc-token-mode=typehash" +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=typehashpointersplit %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-TYPEHASHPTRSPLIT %s +// CHECK-MODE-TYPEHASHPTRSPLIT: "-falloc-token-mode=typehashpointersplit" +// RUN: not %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=asdf %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-MODE %s +// CHECK-INVALID-MODE: error: invalid value 'asdf' From 50acc09c1d6074ae7d1ed4e258cb1d82492f7c1a Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 22 Oct 2025 05:59:52 -0700 Subject: [PATCH 092/530] [clang-fuzzer] Remove Dockerfile (#162555) Was going through Dockerfiles to see where we are missing FROM lines with fully qualified names and came across this one. I think it is safe to say it has not been used in a very long time or maintained at all since then since it still tries to download the source tree using svn. Given that, delete it to lower support surface slightly. 
--- clang/tools/clang-fuzzer/Dockerfile | 41 ----------------------------- clang/tools/clang-fuzzer/README.txt | 3 --- 2 files changed, 44 deletions(-) delete mode 100644 clang/tools/clang-fuzzer/Dockerfile diff --git a/clang/tools/clang-fuzzer/Dockerfile b/clang/tools/clang-fuzzer/Dockerfile deleted file mode 100644 index 1ddf82954e219..0000000000000 --- a/clang/tools/clang-fuzzer/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -#===- llvm/tools/clang/tools/clang-fuzzer ---------------------------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===----------------------------------------------------------------------===// -# Produces an image that builds clang-proto-fuzzer -FROM ubuntu:16.04 -RUN apt-get update -y -RUN apt-get install -y autoconf automake libtool curl make g++ unzip wget git \ - binutils liblzma-dev libz-dev python-all cmake ninja-build subversion \ - pkg-config docbook2x - -WORKDIR /root - -# Get protobuf -RUN wget -qO- https://github.com/google/protobuf/releases/download/v3.3.0/protobuf-cpp-3.3.0.tar.gz | tar zxf - -RUN cd protobuf-3.3.0 && ./autogen.sh && ./configure && make -j $(nproc) && make check -j $(nproc) && make install && ldconfig -# Get LLVM -RUN svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm -RUN cd llvm/tools && svn co http://llvm.org/svn/llvm-project/cfe/trunk clang -r $(cd ../ && svn info | grep Revision | awk '{print $2}') -RUN cd llvm/projects && svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk compiler-rt -r $(cd ../ && svn info | grep Revision | awk '{print $2}') -# Build plain LLVM (stage 0) -RUN mkdir build0 && cd build0 && cmake -GNinja -DCMAKE_BUILD_TYPE=Release ../llvm && ninja -# Configure instrumented LLVM (stage 1) -RUN mkdir build1 && cd build1 && cmake -GNinja -DCMAKE_BUILD_TYPE=Release ../llvm \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DCMAKE_C_COMPILER=`pwd`/../build0/bin/clang \ - -DCMAKE_CXX_COMPILER=`pwd`/../build0/bin/clang++ \ - -DLLVM_USE_SANITIZE_COVERAGE=YES \ - -DLLVM_USE_SANITIZER=Address -DCLANG_ENABLE_PROTO_FUZZER=ON -# Build the fuzzers -RUN cd build1 && ninja clang-fuzzer -RUN cd build1 && ninja clang-objc-fuzzer -RUN cd build1 && ninja clang-proto-fuzzer -RUN cd build1 && ninja clang-proto-to-cxx -RUN cd build1 && ninja clang-loop-proto-to-cxx -RUN cd build1 && ninja clang-loop-proto-to-llvm -RUN cd build1 && ninja clang-loop-proto-fuzzer -RUN cd build1 && ninja clang-llvm-proto-fuzzer diff --git a/clang/tools/clang-fuzzer/README.txt b/clang/tools/clang-fuzzer/README.txt index eec4a9efdfc66..218c54437a466 100644 --- a/clang/tools/clang-fuzzer/README.txt +++ b/clang/tools/clang-fuzzer/README.txt @@ -99,9 +99,6 @@ Example: -DCLANG_ENABLE_PROTO_FUZZER=ON ninja clang-proto-fuzzer clang-proto-to-cxx -This directory also contains a Dockerfile which sets up all required -dependencies and builds the fuzzers. - ============================ Running clang-proto-fuzzer ============================ From 6e0553f545df37a31b096f462f5319312728daca Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Wed, 22 Oct 2025 15:00:28 +0200 Subject: [PATCH 093/530] Reapply "[Polly] Update ScopInliner for NPM (#125427)" (#164601) An assertion failed when Polly was registering for the pass manager which assumed that there would be only Polly passes. Since this does not need to be the case, re-apply with the assert removed. 
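For reference, with this change the inliner can be driven through the new pass manager roughly as follows (a sketch mirroring the updated RUN lines in the ScopInliner tests further down; the input file name is a placeholder, and Polly is assumed to be built in or loaded as a plugin):

  opt -polly-detect-full-functions -passes='cgscc(polly-inline)' -disable-output input.ll

The pass is registered as a CGSCC pass named polly-inline in PollyPasses.def, and it still requires the full-function Scop detection option, since its heuristic only fires when the entire function forms a Scop.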
Includes a non-Polly change to trigger the premerge CI to trigger check-llvm which failed for 0b9a7b80c0674c5c6f746139912111bea7eae63b, but pre-merge did not catch. --- llvm/lib/Transforms/Vectorize/VPlanValue.h | 10 +- polly/docs/ReleaseNotes.rst | 2 + polly/include/polly/LinkAllPasses.h | 2 +- polly/include/polly/ScopInliner.h | 34 ++++ polly/lib/Support/PollyPasses.def | 6 + polly/lib/Support/RegisterPasses.cpp | 39 ++++- polly/lib/Transform/ScopInliner.cpp | 159 +++++++++++------- polly/test/ScopInliner/ignore-declares.ll | 3 +- polly/test/ScopInliner/invariant-load-func.ll | 5 +- polly/test/ScopInliner/simple-inline-loop.ll | 3 +- 10 files changed, 187 insertions(+), 76 deletions(-) create mode 100644 polly/include/polly/ScopInliner.h diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 0678bc90ef4b5..83e3fcaaeee2b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -41,10 +41,10 @@ class VPRecipeBase; class VPInterleaveBase; class VPPhiAccessors; -// This is the base class of the VPlan Def/Use graph, used for modeling the data -// flow into, within and out of the VPlan. VPValues can stand for live-ins -// coming from the input IR and instructions which VPlan will generate if -// executed. +/// This is the base class of the VPlan Def/Use graph, used for modeling the +/// data flow into, within and out of the VPlan. VPValues can stand for live-ins +/// coming from the input IR and instructions which VPlan will generate if +/// executed. class LLVM_ABI_FOR_TEST VPValue { friend class VPDef; friend struct VPDoubleValueDef; @@ -57,7 +57,7 @@ class LLVM_ABI_FOR_TEST VPValue { SmallVector Users; protected: - // Hold the underlying Value, if any, attached to this VPValue. + /// Hold the underlying Value, if any, attached to this VPValue. Value *UnderlyingVal; /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst index f7c9689089be2..f5ea47b69cf02 100644 --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -11,3 +11,5 @@ In Polly |version| the following important changes have been incorporated. the new features that have recently been committed to our development branch. + * ScopInliner has been updated for the New Pass Manager. + diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h index c3b68a74056ac..9978344c73e9f 100644 --- a/polly/include/polly/LinkAllPasses.h +++ b/polly/include/polly/LinkAllPasses.h @@ -119,7 +119,7 @@ struct PollyForcePassLinking { namespace llvm { void initializeCodePreparationPass(llvm::PassRegistry &); -void initializeScopInlinerPass(llvm::PassRegistry &); +void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); void initializeScopInfoRegionPassPass(PassRegistry &); diff --git a/polly/include/polly/ScopInliner.h b/polly/include/polly/ScopInliner.h new file mode 100644 index 0000000000000..014667804330f --- /dev/null +++ b/polly/include/polly/ScopInliner.h @@ -0,0 +1,34 @@ +//===------ ScopInliner.h ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_POLLYINLINER_H +#define POLLY_POLLYINLINER_H + +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/IR/PassManager.h" + +namespace polly { +class ScopInlinerPass : public llvm::PassInfoMixin { +public: + ScopInlinerPass(); + + llvm::PreservedAnalyses run(llvm::LazyCallGraph::SCC &C, + llvm::CGSCCAnalysisManager &AM, + llvm::LazyCallGraph &CG, + llvm::CGSCCUpdateResult &UR); +}; + +llvm::Pass *createScopInlinerWrapperPass(); +} // namespace polly + +namespace llvm { +void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); +} + +#endif /* POLLY_POLLYINLINER_H */ diff --git a/polly/lib/Support/PollyPasses.def b/polly/lib/Support/PollyPasses.def index e068f31fdb703..2c792a5867100 100644 --- a/polly/lib/Support/PollyPasses.def +++ b/polly/lib/Support/PollyPasses.def @@ -1,3 +1,9 @@ +#ifndef CGSCC_PASS +#define CGSCC_PASS(NAME, CREATE_PASS, PARSER) +#endif +CGSCC_PASS("polly-inline", ScopInlinerPass(), parseNoOptions) +#undef CGSCC_PASS + #ifndef FUNCTION_ANALYSIS #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) #endif diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp index 0420dff944f62..04f8715502c38 100644 --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -35,6 +35,7 @@ #include "polly/ScopDetection.h" #include "polly/ScopGraphPrinter.h" #include "polly/ScopInfo.h" +#include "polly/ScopInliner.h" #include "polly/Simplify.h" #include "polly/Support/DumpFunctionPass.h" #include "polly/Support/DumpModulePass.h" @@ -46,10 +47,13 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/PassPlugin.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Transforms/IPO.h" +using namespace llvm; namespace cl = llvm::cl; +using namespace polly; using llvm::FunctionPassManager; using llvm::OptimizationLevel; @@ -233,7 +237,7 @@ void initializePollyPasses(llvm::PassRegistry &Registry) { initializePollyCanonicalizePass(Registry); initializeScopDetectionWrapperPassPass(Registry); initializeScopDetectionPrinterLegacyPassPass(Registry); - initializeScopInlinerPass(Registry); + initializeScopInlinerWrapperPassPass(Registry); initializeScopInfoRegionPassPass(Registry); initializeScopInfoPrinterLegacyRegionPassPass(Registry); initializeScopInfoWrapperPassPass(Registry); @@ -434,6 +438,16 @@ static void buildLatePollyPipeline(FunctionPassManager &PM, false); } +static llvm::Expected parseNoOptions(StringRef Params) { + if (!Params.empty()) + return make_error( + formatv("'{0}' passed to pass that does not take any options", Params) + .str(), + inconvertibleErrorCode()); + + return std::monostate{}; +} + static OwningScopAnalysisManagerFunctionProxy createScopAnalyses(FunctionAnalysisManager &FAM, PassInstrumentationCallbacks *PIC) { @@ -461,6 +475,23 @@ static void registerFunctionAnalyses(FunctionAnalysisManager &FAM, FAM.registerPass([&FAM, PIC] { return createScopAnalyses(FAM, PIC); }); } +static llvm::Expected +parseCGPipeline(StringRef Name, llvm::CGSCCPassManager &CGPM, + PassInstrumentationCallbacks *PIC, + ArrayRef Pipeline) { +#define CGSCC_PASS(NAME, CREATE_PASS, PARSER) \ + if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ + auto Params = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ + if (!Params) \ + return 
Params.takeError(); \ + CGPM.addPass(CREATE_PASS); \ + return true; \ + } +#include "PollyPasses.def" + + return false; +} + static bool parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, ArrayRef Pipeline) { @@ -598,6 +629,12 @@ void registerPollyPasses(PassBuilder &PB) { ArrayRef Pipeline) -> bool { return parseScopPipeline(Name, FPM, PIC, Pipeline); }); + PB.registerPipelineParsingCallback( + [PIC](StringRef Name, CGSCCPassManager &CGPM, + ArrayRef Pipeline) -> bool { + ExitOnError Err("Unable to parse Polly call graph pass: "); + return Err(parseCGPipeline(Name, CGPM, PIC, Pipeline)); + }); PB.registerParseTopLevelPipelineCallback( [PIC](llvm::ModulePassManager &MPM, ArrayRef Pipeline) -> bool { diff --git a/polly/lib/Transform/ScopInliner.cpp b/polly/lib/Transform/ScopInliner.cpp index b78206c1e40ba..c04ba3498339e 100644 --- a/polly/lib/Transform/ScopInliner.cpp +++ b/polly/lib/Transform/ScopInliner.cpp @@ -13,10 +13,14 @@ // //===----------------------------------------------------------------------===// -#include "polly/LinkAllPasses.h" +#include "polly/ScopInliner.h" #include "polly/ScopDetection.h" +#include "polly/ScopInliner.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" @@ -28,13 +32,77 @@ using namespace llvm; using namespace polly; namespace { -class ScopInliner final : public CallGraphSCCPass { + +/// Inliner implementation that works with both, LPM (using SCC_t=CallGraph) and +/// NPM (using SCC_t=LazyCallGraph::SCC) +template bool runScopInlinerImpl(Function *F, SCC_t &SCC) { + // We do not try to inline non-trivial SCCs because this would lead to + // "infinite" inlining if we are not careful. + if (SCC.size() > 1) + return false; + assert(SCC.size() == 1 && "found empty SCC"); + + // If the function is a nullptr, or the function is a declaration. + if (!F) + return false; + if (F->isDeclaration()) { + POLLY_DEBUG(dbgs() << "Skipping " << F->getName() + << "because it is a declaration.\n"); + return false; + } + + PassBuilder PB; + // Populate analysis managers and register Polly-specific analyses. 
+ LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + auto &DT = FAM.getResult(*F); + auto &SE = FAM.getResult(*F); + auto &LI = FAM.getResult(*F); + auto &RI = FAM.getResult(*F); + auto &AA = FAM.getResult(*F); + auto &ORE = FAM.getResult(*F); + ScopDetection SD(DT, SE, LI, RI, AA, ORE); + SD.detect(*F); + + const bool HasScopAsTopLevelRegion = + SD.ValidRegions.contains(RI.getTopLevelRegion()); + + bool Changed = false; + if (HasScopAsTopLevelRegion) { + POLLY_DEBUG(dbgs() << "Skipping " << F->getName() + << " has scop as top level region"); + F->addFnAttr(llvm::Attribute::AlwaysInline); + + ModulePassManager MPM; + MPM.addPass(AlwaysInlinerPass()); + Module *M = F->getParent(); + assert(M && "Function has illegal module"); + PreservedAnalyses PA = MPM.run(*M, MAM); + if (!PA.areAllPreserved()) + Changed = true; + } else { + POLLY_DEBUG(dbgs() << F->getName() + << " does NOT have scop as top level region\n"); + } + + return Changed; +} + +class ScopInlinerWrapperPass final : public CallGraphSCCPass { using llvm::Pass::doInitialization; public: static char ID; - ScopInliner() : CallGraphSCCPass(ID) {} + ScopInlinerWrapperPass() : CallGraphSCCPass(ID) {} bool doInitialization(CallGraph &CG) override { if (!polly::PollyAllowFullFunction) { @@ -50,60 +118,8 @@ class ScopInliner final : public CallGraphSCCPass { } bool runOnSCC(CallGraphSCC &SCC) override { - // We do not try to inline non-trivial SCCs because this would lead to - // "infinite" inlining if we are not careful. - if (SCC.size() > 1) - return false; - assert(SCC.size() == 1 && "found empty SCC"); Function *F = (*SCC.begin())->getFunction(); - - // If the function is a nullptr, or the function is a declaration. - if (!F) - return false; - if (F->isDeclaration()) { - POLLY_DEBUG(dbgs() << "Skipping " << F->getName() - << "because it is a declaration.\n"); - return false; - } - - PassBuilder PB; - // Populate analysis managers and register Polly-specific analyses. 
- LoopAnalysisManager LAM; - FunctionAnalysisManager FAM; - CGSCCAnalysisManager CGAM; - ModuleAnalysisManager MAM; - FAM.registerPass([] { return ScopAnalysis(); }); - PB.registerModuleAnalyses(MAM); - PB.registerCGSCCAnalyses(CGAM); - PB.registerFunctionAnalyses(FAM); - PB.registerLoopAnalyses(LAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - - RegionInfo &RI = FAM.getResult(*F); - ScopDetection &SD = FAM.getResult(*F); - - const bool HasScopAsTopLevelRegion = - SD.ValidRegions.contains(RI.getTopLevelRegion()); - - bool Changed = false; - if (HasScopAsTopLevelRegion) { - POLLY_DEBUG(dbgs() << "Skipping " << F->getName() - << " has scop as top level region"); - F->addFnAttr(llvm::Attribute::AlwaysInline); - - ModulePassManager MPM; - MPM.addPass(AlwaysInlinerPass()); - Module *M = F->getParent(); - assert(M && "Function has illegal module"); - PreservedAnalyses PA = MPM.run(*M, MAM); - if (!PA.areAllPreserved()) - Changed = true; - } else { - POLLY_DEBUG(dbgs() << F->getName() - << " does NOT have scop as top level region\n"); - } - - return Changed; + return runScopInlinerImpl(F, SCC); }; void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -111,18 +127,39 @@ class ScopInliner final : public CallGraphSCCPass { } }; } // namespace -char ScopInliner::ID; +char ScopInlinerWrapperPass::ID; -Pass *polly::createScopInlinerPass() { - ScopInliner *pass = new ScopInliner(); +Pass *polly::createScopInlinerWrapperPass() { + ScopInlinerWrapperPass *pass = new ScopInlinerWrapperPass(); return pass; } INITIALIZE_PASS_BEGIN( - ScopInliner, "polly-scop-inliner", + ScopInlinerWrapperPass, "polly-scop-inliner", "inline functions based on how much of the function is a scop.", false, false) INITIALIZE_PASS_END( - ScopInliner, "polly-scop-inliner", + ScopInlinerWrapperPass, "polly-scop-inliner", "inline functions based on how much of the function is a scop.", false, false) + +polly::ScopInlinerPass::ScopInlinerPass() { + if (!polly::PollyAllowFullFunction) { + report_fatal_error( + "Aborting from ScopInliner because it only makes sense to run with " + "-polly-allow-full-function. " + "The heurtistic for ScopInliner checks that the full function is a " + "Scop, which happens if and only if polly-allow-full-function is " + " enabled. " + " If not, the entry block is not included in the Scop"); + } +} + +PreservedAnalyses polly::ScopInlinerPass::run(llvm::LazyCallGraph::SCC &SCC, + llvm::CGSCCAnalysisManager &AM, + llvm::LazyCallGraph &CG, + llvm::CGSCCUpdateResult &UR) { + Function *F = &SCC.begin()->getFunction(); + bool Changed = runScopInlinerImpl(F, SCC); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/polly/test/ScopInliner/ignore-declares.ll b/polly/test/ScopInliner/ignore-declares.ll index 11722dcb32166..5c0cfa103f0bf 100644 --- a/polly/test/ScopInliner/ignore-declares.ll +++ b/polly/test/ScopInliner/ignore-declares.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadPolly -polly-detect-full-functions -polly-scop-inliner \ -; RUN: -polly-scops -disable-output < %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print)' -disable-output < %s ; Check that we do not crash if there are declares. We should skip function ; declarations and not try to query for domtree. 
diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll index ffd2ec9cdb60f..58c556a455fb9 100644 --- a/polly/test/ScopInliner/invariant-load-func.ll +++ b/polly/test/ScopInliner/invariant-load-func.ll @@ -1,12 +1,9 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-scop-inliner \ -; RUN: -polly-invariant-load-hoisting '-passes=print' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),function(print)' -disable-output < %s 2>&1 | FileCheck %s ; Check that we inline a function that requires invariant load hoisting ; correctly. ; CHECK: Max Loop Depth: 2 -; REQUIRES: pollyacc - ; void to_be_inlined(int A[], int *begin, int *end) { ; for(int i = *begin; i < *end; i++) { diff --git a/polly/test/ScopInliner/simple-inline-loop.ll b/polly/test/ScopInliner/simple-inline-loop.ll index a5e3483edad05..f12798a3d831a 100644 --- a/polly/test/ScopInliner/simple-inline-loop.ll +++ b/polly/test/ScopInliner/simple-inline-loop.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadPolly -polly-detect-full-functions -polly-scop-inliner \ -; RUN: -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print)' -disable-output < %s 2>&1 | FileCheck %s ; Check that we get the 2 nested loops by inlining `to_be_inlined` into ; `inline_site`. From 6ceefbe87c5e19655dce6323c2fca2fe53fd7bec Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 22 Oct 2025 06:09:28 -0700 Subject: [PATCH 094/530] [OpenACC][CIR] Implement || and && reduction combiner lowering (#164298) These two operations are expressed as LHS = LHS || RHS, for any construct in which that is valid. Fortunately, the mechanism for codegen from previous reduction works great for this, so it saw minimal changes. This is the last of the reduction construct lowering. 
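Concretely, for a scalar reduction variable the combiner recipe built in Sema is just the assignment form of the logical operator (a C++-style sketch, not the emitted CIR):

  lhs = lhs && rhs;   // reduction(&&: ...)
  lhs = lhs || rhs;   // reduction(||: ...)

which CIR lowering then expands into the short-circuiting cir.ternary sequences checked in the updated tests below.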
--- clang/lib/Sema/SemaOpenACC.cpp | 25 +- .../combined-reduction-clause-default-ops.cpp | 534 ++++++++++++++++- .../combined-reduction-clause-float.cpp | 180 +++++- .../combined-reduction-clause-inline-ops.cpp | 126 +++- .../combined-reduction-clause-int.cpp | 180 +++++- .../combined-reduction-clause-outline-ops.cpp | 126 +++- .../compute-reduction-clause-default-ops.c | 552 +++++++++++++++++- .../compute-reduction-clause-default-ops.cpp | 534 ++++++++++++++++- .../compute-reduction-clause-float.c | 186 +++++- .../compute-reduction-clause-float.cpp | 180 +++++- .../compute-reduction-clause-inline-ops.cpp | 126 +++- .../compute-reduction-clause-int.c | 180 +++++- .../compute-reduction-clause-int.cpp | 180 +++++- .../compute-reduction-clause-outline-ops.cpp | 126 +++- .../compute-reduction-clause-unsigned-int.c | 186 +++++- .../loop-reduction-clause-default-ops.cpp | 534 ++++++++++++++++- .../loop-reduction-clause-float.cpp | 180 +++++- .../loop-reduction-clause-inline-ops.cpp | 126 +++- .../loop-reduction-clause-int.cpp | 180 +++++- .../loop-reduction-clause-outline-ops.cpp | 126 +++- .../reduction-clause-recipes.cpp | 95 ++- 21 files changed, 4516 insertions(+), 146 deletions(-) diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 3bb8080f6e72c..ee9b2b399e762 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2999,11 +2999,11 @@ bool SemaOpenACC::CreateReductionCombinerRecipe( BinOp = BinaryOperatorKind::BO_LT; break; case OpenACCReductionOperator::And: + BinOp = BinaryOperatorKind::BO_LAnd; + break; case OpenACCReductionOperator::Or: - // We just want a 'NYI' error in the backend, so leave an empty combiner - // recipe, and claim success. - CombinerRecipes.push_back({nullptr, nullptr, nullptr}); - return false; + BinOp = BinaryOperatorKind::BO_LOr; + break; } // If VarTy is an array type, at the top level only, we want to do our @@ -3068,8 +3068,21 @@ bool SemaOpenACC::CreateReductionCombinerRecipe( : CombinerFailureKind::Assignment}; } case OpenACCReductionOperator::And: - case OpenACCReductionOperator::Or: - llvm_unreachable("And/Or not implemented, but should fail earlier"); + case OpenACCReductionOperator::Or: { + // These are done as LHS = LHS && RHS (or LHS = LHS || RHS). So after the + // binop, all we have to do is the assignment. + if (!BinOpRes.isUsable()) + return {BinOpRes, CombinerFailureKind::BinOp}; + + // Build assignment. + ExprResult Assignment = SemaRef.BuildBinOp(SemaRef.getCurScope(), Loc, + BinaryOperatorKind::BO_Assign, + LHSDRE, BinOpRes.get(), + /*ForFoldExpr=*/false); + return {Assignment, Assignment.isUsable() + ? 
CombinerFailureKind::None + : CombinerFailureKind::Assignment}; + } case OpenACCReductionOperator::Invalid: llvm_unreachable("Invalid should have been caught above"); } diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp index c1c2e4b715365..53eba7bafb312 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -480,7 +480,77 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: 
%[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -507,7 +577,77 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : 
(!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = 
cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -1532,7 +1672,101 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// 
CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : 
!cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -1576,7 +1810,101 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr 
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2398,6 +2726,104 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] 
: (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// 
CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2446,6 +2872,104 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: 
cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast 
float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp index 853f345e53ddf..63d69529bee53 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_combined() { T someVar; @@ -92,7 +92,18 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = 
cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -106,7 +117,18 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -371,7 +393,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: 
%[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -401,7 +457,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -708,7 +798,45 @@ void acc_combined() { // 
CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -744,7 +872,45 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = 
cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp index 67e8460649f7e..9c1b16174d5fb 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -14,8 +14,8 @@ struct HasOperatorsInline { HasOperatorsInline &operator&=(HasOperatorsInline& other); HasOperatorsInline &operator|=(HasOperatorsInline& other); HasOperatorsInline &operator^=(HasOperatorsInline& other); - bool &operator&&(HasOperatorsInline& other); - bool &operator||(HasOperatorsInline& other); + HasOperatorsInline &operator&&(HasOperatorsInline& other); + HasOperatorsInline &operator||(HasOperatorsInline& other); // For min/max bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); @@ -277,7 +277,8 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: 
^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -286,7 +287,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -308,7 +309,8 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -318,7 +320,7 @@ void acc_combined() { for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1254,7 +1256,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call 
@_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1318,7 +1344,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2121,6 +2171,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// 
CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2198,6 +2277,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp index d74de8220225a..78b43ddc8f182 100644 --- 
a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_combined() { @@ -145,7 +145,18 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -159,7 +170,18 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -587,7 +609,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// 
CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -617,7 +673,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> 
!cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1116,7 +1206,45 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1152,7 +1280,45 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: 
^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp index a6df6c03f5c8e..5b37071a22ddb 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, 
HasOperatorsOutline &); HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); @@ -276,7 +276,8 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -285,7 +286,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -307,7 +308,8 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -317,7 +319,7 @@ void acc_combined() { for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1253,7 +1255,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : 
!s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1317,7 +1343,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2120,6 +2170,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2197,6 +2276,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], 
%[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c index d65d5d4add0ac..6ec1c43ebbe45 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -485,7 +485,80 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] 
= cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -512,7 +585,80 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i 
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = 
cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -1516,7 +1662,104 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store 
{{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr 
-> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1546,7 +1789,104 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = 
cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const 
#true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2376,6 +2716,107 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} 
%[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} 
%[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2424,6 +2865,107 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} 
%[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// 
CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp index f32fa2d2d6372..7bd6f67a9e19e 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -480,7 +480,77 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast 
bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -507,7 +577,77 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// 
CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield 
%[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -1532,7 +1672,101 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> 
!cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// 
CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1576,7 +1810,101 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] 
: !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2398,6 +2726,104 @@ void acc_compute() { // CHECK-NEXT: acc.yield // 
CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = 
cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2446,6 +2872,104 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : 
(!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, 
!cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c index 9f7336727e5a9..13c335b867044 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict 
-emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s void acc_compute() { float someVar; @@ -92,7 +92,19 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -106,7 +118,19 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -371,7 +395,42 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> 
!cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -401,7 +460,42 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int 
%[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -708,7 +802,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -744,7 
+877,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp index ffd26319e9bfc..67378210ba83c 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir 
-fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_compute() { @@ -93,7 +93,18 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -107,7 +118,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -372,7 +394,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : 
(!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -402,7 +458,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = 
cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -709,7 +799,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -745,7 +873,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp index 1e367ee37a30d..262fe98d8e1df 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -14,8 +14,8 @@ struct HasOperatorsInline { HasOperatorsInline &operator&=(HasOperatorsInline& other); HasOperatorsInline &operator|=(HasOperatorsInline& other); HasOperatorsInline &operator^=(HasOperatorsInline& other); - bool &operator&&(HasOperatorsInline& other); - bool &operator||(HasOperatorsInline& other); + HasOperatorsInline &operator&&(HasOperatorsInline& other); + HasOperatorsInline &operator||(HasOperatorsInline& other); // 
For min/max bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); @@ -277,7 +277,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -286,7 +287,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -308,7 +309,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -318,7 +320,7 @@ void acc_compute() { ; #pragma acc parallel reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1254,7 +1256,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = 
cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1318,7 +1344,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2121,6 +2171,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: 
cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2198,6 +2277,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c 
b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c index 2f42a5c63f149..be7b12350360d 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s void acc_compute() { int someVar; @@ -144,7 +144,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -158,7 +169,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -586,7 +608,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: 
%[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -616,7 +672,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// 
CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1115,7 +1205,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1151,7 +1279,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: 
!cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp index af7bcf3770fe1..fb6984fcd0068 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_compute() { @@ -145,7 +145,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO 
OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -159,7 +170,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -587,7 +609,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} 
%[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -617,7 +673,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1116,7 +1206,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, 
%[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1152,7 +1280,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = 
cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp index ec890e2b1de65..3a80ed5304928 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); @@ -276,7 +276,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting 
combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -285,7 +286,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -307,7 +308,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -317,7 +319,7 @@ void acc_compute() { ; #pragma acc parallel reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1253,7 +1255,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], 
%[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1317,7 +1343,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2120,6 +2170,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], 
%[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2197,6 +2276,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c index 08daa702c47f8..9b10a296e99f5 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple 
x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s void acc_compute() { unsigned int someVar; @@ -143,7 +143,19 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !u32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -157,7 +169,19 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !u32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -585,7 +609,42 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast 
array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -615,7 +674,42 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// 
CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1114,7 +1208,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield 
%[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1150,7 +1283,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp index 1a77c0f10a144..11ebd7b4c26cb 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu 
-Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -480,7 +480,77 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = 
cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -507,7 +577,77 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] 
= cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -1532,7 +1672,101 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation 
here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// 
CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -1576,7 +1810,101 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// 
CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : 
!cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2398,6 +2726,104 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> 
!cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: 
%[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2446,6 +2872,104 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// 
CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// 
CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp index 7faef7111a9c8..57cc1afec2911 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_loop() { @@ -93,7 +93,18 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // 
CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -107,7 +118,18 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -372,7 +394,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// 
CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -402,7 +458,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -709,7 +799,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = 
cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -745,7 +873,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, 
!u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp index 43c9fbbce7533..8a5bf3ebb3e12 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -14,8 +14,8 @@ struct HasOperatorsInline { HasOperatorsInline &operator&=(HasOperatorsInline& other); HasOperatorsInline &operator|=(HasOperatorsInline& other); HasOperatorsInline &operator^=(HasOperatorsInline& other); - bool &operator&&(HasOperatorsInline& other); - bool &operator||(HasOperatorsInline& other); + HasOperatorsInline &operator&&(HasOperatorsInline& other); + HasOperatorsInline &operator||(HasOperatorsInline& other); // For min/max bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); @@ -277,7 +277,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -286,7 +287,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = 
cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -308,7 +309,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -318,7 +320,7 @@ void acc_loop() { for(int i=0;i < 5; ++i); #pragma acc loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1254,7 +1256,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1318,7 +1344,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store 
%[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2121,6 +2171,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // 
CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2198,6 +2277,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp index 5353218866d47..f60dff9385412 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_loop() { @@ -145,7 +145,18 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: 
%[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -159,7 +170,18 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -587,7 +609,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : 
(!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -617,7 +673,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1116,7 +1206,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] 
: index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1152,7 +1280,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: 
%[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp index e193cfa1a5ab2..8613bc8a0f27e 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); @@ -276,7 +276,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // 
CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -285,7 +286,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -307,7 +308,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -317,7 +319,7 @@ void acc_loop() { for(int i=0;i < 5; ++i); #pragma acc loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1253,7 +1255,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: 
acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1317,7 +1343,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2120,6 +2170,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// 
CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2197,6 +2276,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp index 20ad7a31b635e..29f1b5fe10158 100644 --- a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp +++ b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s // Note: unlike the 'private' 
recipe checks, this is just for spot-checking, // so this test isn't as comprehensive. The same code paths are used for @@ -753,8 +753,97 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { -// CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): -// CHECK-NEXT: acc.yield +// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB3:.*]] = acc.get_lowerbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB3]] : index to !u64i +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ITR3:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB3_CAST]], %[[ITR3]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR3_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_TLA_LOAD:.*]] = cir.load %[[LHSARG]] : !cir.ptr x 5>>>, !cir.ptr x 5>> +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[LHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr x 5>>, !u64i) -> !cir.ptr x 5>> +// CHECK-NEXT: %[[RHS_TLA_LOAD:.*]] = cir.load %[[RHSARG]] : !cir.ptr x 5>>>, !cir.ptr x 5>> +// CHECK-NEXT: %[[RHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[RHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr x 5>>, !u64i) -> !cir.ptr x 5>> +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB2:.*]] = acc.get_lowerbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB2]] : index to !u64i +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ITR2:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB2_CAST]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR2_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHS_BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// CHECK-NEXT: %[[RHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHS_BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// 
CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB1]] : index to !u64i +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ITR1:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB1_CAST]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR1_LOAD]], %[[UB1_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[LHS_BOUND2_STRIDE]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[RHS_BOUND2_STRIDE]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_INT]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: acc.yield %[[LHSARG]] // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: 
!acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): // CHECK-NEXT: cir.scope { From aca53f4375d1792cfd706ef4215ab4b350042c5c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 22 Oct 2025 14:10:00 +0100 Subject: [PATCH 095/530] [VPlan] Skip masked interleave groups in narrowInterleaveGroups. 8d29d09309 exposed a crash due to incorrectly trying to handle masked interleave recipes. For now, the current code does not support masked interleave recipes. Bail out for them. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 +- ...row-interleave-to-widen-memory-scalable.ll | 172 ++++++++++++++++++ 2 files changed, 173 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ff25ef52e3380..48cf763fa398e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4051,7 +4051,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, static std::optional isConsecutiveInterleaveGroup( VPInterleaveRecipe *InterleaveR, ArrayRef VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) { - if (!InterleaveR) + if (!InterleaveR || InterleaveR->getMask()) return std::nullopt; Type *GroupElementTy = nullptr; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll index 829acbbf71548..305a6920fad16 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll @@ -210,3 +210,175 @@ loop: exit: ret void } + +define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) { +; IC1-LABEL: define void @test_masked_interleave_group( +; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; IC1-NEXT: [[ENTRY:.*:]] +; IC1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; IC1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IC1: [[VECTOR_MEMCHECK]]: +; IC1-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; IC1-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; IC1-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; IC1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; IC1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; IC1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; IC1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; IC1-NEXT: 
br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; IC1-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; IC1-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; IC1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; IC1-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; IC1-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; IC1-NEXT: [[TMP16:%.*]] = icmp eq [[WIDE_LOAD]], zeroinitializer +; IC1-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv16i1( [[TMP16]], [[TMP16]], [[TMP16]], [[TMP16]]) +; IC1-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], [[INTERLEAVED_MASK]], poison), !alias.scope [[META9:![0-9]+]] +; IC1-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16f32( [[WIDE_MASKED_VEC]]) +; IC1-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; IC1-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; IC1-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; IC1-NEXT: [[TMP20:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv16f32( [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]]) +; IC1-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave4.nxv16i1( [[TMP16]], [[TMP16]], [[TMP16]], [[TMP16]]) +; IC1-NEXT: call void @llvm.masked.store.nxv16f32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; +; CHECK-LABEL: define void @test_masked_interleave_group( +; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], 
label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv16i1( [[TMP16]], [[TMP16]], [[TMP16]], [[TMP16]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], [[INTERLEAVED_MASK]], poison), !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16f32( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv16f32( [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave4.nxv16i1( [[TMP16]], [[TMP16]], [[TMP16]], [[TMP16]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], [[INTERLEAVED_MASK9]]), !alias.scope 
[[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %dst.iv = phi ptr [ %dst, %entry ], [ %dst.iv.next, %loop.latch ] + %src.iv = phi ptr [ %src, %entry ], [ %src.iv.next, %loop.latch ] + %mask.iv = phi ptr [ %mask, %entry ], [ %mask.iv.next, %loop.latch ] + %mask.iv.next = getelementptr i8, ptr %mask.iv, i64 1 + %mask.val = load i8, ptr %mask.iv, align 1 + %should.copy = icmp eq i8 %mask.val, 0 + br i1 %should.copy, label %then, label %loop.latch + +then: + %elem0 = load float, ptr %src.iv, align 4 + store float %elem0, ptr %dst.iv, align 4 + %src.1.ptr = getelementptr i8, ptr %src.iv, i64 4 + %s1 = load float, ptr %src.1.ptr, align 4 + %dst.1.ptr = getelementptr i8, ptr %dst.iv, i64 4 + store float %s1, ptr %dst.1.ptr, align 4 + %src.2.ptr = getelementptr i8, ptr %src.iv, i64 8 + %s2 = load float, ptr %src.2.ptr, align 4 + %dst.2.ptr = getelementptr i8, ptr %dst.iv, i64 8 + store float %s2, ptr %dst.2.ptr, align 4 + %src.3.ptr = getelementptr i8, ptr %src.iv, i64 12 + %s3 = load float, ptr %src.3.ptr, align 4 + %dst.3.ptr = getelementptr i8, ptr %dst.iv, i64 12 + store float %s3, ptr %dst.3.ptr, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add i32 %iv, 1 + %src.iv.next = getelementptr i8, ptr %src.iv, i64 16 + %dst.iv.next = getelementptr i8, ptr %dst.iv, i64 16 + %ec = icmp eq i32 %iv, %N + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} From b3073470424c9ef4c5f319d3eed4d42170e15cf1 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 22 Oct 2025 06:10:46 -0700 Subject: [PATCH 096/530] [OpenACC][CIR] Lowering for atomic-read (#164299) The OpenACC spec allows only `v = x` form for atomic-read, and only when both are L-values. The result is this ends up being a pretty trivial patch, however it adds a decent amount of infrastructure for the other forms of atomic. Additionally, the 3.4 spec starts allowing the 'if' clause on atomic, which has recently been added to the ACC dialect. This patch also ensures that can be lowered as well. Extensive testing of this feature was done on other clauses, so there isn't much further work/testing to be done for it. 
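For illustration, this is roughly the shape of source the new lowering handles (a minimal sketch; the variable names and the 'if' condition here are made up for the example, not taken from the added test):

```c++
void example(int x) {
  int v;
  // Only the plain 'v = x' form, with both operands being scalar l-values,
  // is accepted for atomic-read; it lowers to an acc.atomic.read op.
#pragma acc atomic read
  v = x;

  // With OpenACC 3.4, an 'if' clause may guard the atomic construct as well.
#pragma acc atomic read if (x == 0)
  v = x;
}
```

The other atomic kinds (write, update, capture) still go down the not-yet-implemented path for now.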
--- clang/include/clang/AST/StmtOpenACC.h | 11 ++++++ clang/lib/AST/StmtOpenACC.cpp | 34 +++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp | 19 +++++++---- clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 27 +++++++++++++-- clang/test/CIR/CodeGenOpenACC/atomic-read.cpp | 24 +++++++++++++ 5 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 clang/test/CIR/CodeGenOpenACC/atomic-read.cpp diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index 8b4554e996326..4d52805033410 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -815,6 +815,17 @@ class OpenACCAtomicConstruct final Stmt *getAssociatedStmt() { return OpenACCAssociatedStmtConstruct::getAssociatedStmt(); } + + // A struct to represent a broken-down version of the associated statement, + // providing the information specified in OpenACC3.3 Section 2.12. + struct StmtInfo { + const Expr *V; + const Expr *X; + // TODO: OpenACC: We should expand this as we're implementing the other + // atomic construct kinds. + }; + + const StmtInfo getAssociatedStmtInfo() const; }; } // namespace clang diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index 07e3de8eeb00d..2b56c1eea547c 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -12,7 +12,9 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ExprCXX.h" #include "clang/AST/StmtCXX.h" + using namespace clang; OpenACCComputeConstruct * @@ -322,6 +324,38 @@ OpenACCAtomicConstruct *OpenACCAtomicConstruct::Create( return Inst; } +const OpenACCAtomicConstruct::StmtInfo +OpenACCAtomicConstruct::getAssociatedStmtInfo() const { + // This ends up being a vastly simplified version of SemaOpenACCAtomic, since + // it doesn't have to worry about erroring out, but we should do a lot of + // asserts to ensure we don't get off into the weeds. + assert(getAssociatedStmt() && "invalid associated stmt?"); + + switch (AtomicKind) { + case OpenACCAtomicKind::None: + case OpenACCAtomicKind::Write: + case OpenACCAtomicKind::Update: + case OpenACCAtomicKind::Capture: + assert(false && "Only 'read' has been implemented here"); + return {}; + case OpenACCAtomicKind::Read: { + // Read only supports the format 'v = x'; where both sides are a scalar + // expression. This can come in 2 forms; BinaryOperator or + // CXXOperatorCallExpr (rarely). 
+ const Expr *AssignExpr = cast(getAssociatedStmt()); + if (const auto *BO = dyn_cast(AssignExpr)) { + assert(BO->getOpcode() == BO_Assign); + return {BO->getLHS()->IgnoreImpCasts(), BO->getRHS()->IgnoreImpCasts()}; + } + + const auto *OO = cast(AssignExpr); + assert(OO->getOperator() == OO_Equal); + + return {OO->getArg(0)->IgnoreImpCasts(), OO->getArg(1)->IgnoreImpCasts()}; + } + } +} + OpenACCCacheConstruct *OpenACCCacheConstruct::CreateEmpty(const ASTContext &C, unsigned NumVars) { void *Mem = diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index ce4ae7ec5efc4..385f89c5544d6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -553,12 +553,15 @@ class OpenACCClauseCIREmitter final } void VisitIfClause(const OpenACCIfClause &clause) { - if constexpr (isOneOfTypes) { + if constexpr (isOneOfTypes< + OpTy, mlir::acc::ParallelOp, mlir::acc::SerialOp, + mlir::acc::KernelsOp, mlir::acc::InitOp, + mlir::acc::ShutdownOp, mlir::acc::SetOp, + mlir::acc::DataOp, mlir::acc::WaitOp, + mlir::acc::HostDataOp, mlir::acc::EnterDataOp, + mlir::acc::ExitDataOp, mlir::acc::UpdateOp, + mlir::acc::AtomicReadOp, mlir::acc::AtomicWriteOp, + mlir::acc::AtomicUpdateOp, mlir::acc::AtomicCaptureOp>) { operation.getIfCondMutable().append( createCondition(clause.getConditionExpr())); } else if constexpr (isCombinedType) { @@ -1144,6 +1147,10 @@ EXPL_SPEC(mlir::acc::HostDataOp) EXPL_SPEC(mlir::acc::EnterDataOp) EXPL_SPEC(mlir::acc::ExitDataOp) EXPL_SPEC(mlir::acc::UpdateOp) +EXPL_SPEC(mlir::acc::AtomicReadOp) +EXPL_SPEC(mlir::acc::AtomicWriteOp) +EXPL_SPEC(mlir::acc::AtomicCaptureOp) +EXPL_SPEC(mlir::acc::AtomicUpdateOp) #undef EXPL_SPEC template diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index e89393c92db33..02bb46d0e4466 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -306,6 +306,29 @@ CIRGenFunction::emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s) { mlir::LogicalResult CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) { - cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct"); - return mlir::failure(); + // For now, we are only support 'read', so diagnose. We can switch on the kind + // later once we start implementing the other 3 forms. + if (s.getAtomicKind() != OpenACCAtomicKind::Read) { + cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct"); + return mlir::failure(); + } + + // While Atomic is an 'associated statement' construct, it 'steals' the + // expression it is associated with rather than emitting it inside of it. So + // it has custom emit logic. + mlir::Location start = getLoc(s.getSourceRange().getBegin()); + OpenACCAtomicConstruct::StmtInfo inf = s.getAssociatedStmtInfo(); + // Atomic 'read' only permits 'v = x', where v and x are both scalar L values. + // The getAssociatedStmtInfo strips off implicit casts, which includes + // implicit conversions and L-to-R-Value conversions, so we can just emit it + // as an L value. The Flang implementation has no problem with different + // types, so it appears that the dialect can handle the conversions. 
+ mlir::Value v = emitLValue(inf.V).getPointer(); + mlir::Value x = emitLValue(inf.X).getPointer(); + mlir::Type resTy = convertType(inf.V->getType()); + auto op = mlir::acc::AtomicReadOp::create(builder, start, x, v, resTy, + /*ifCond=*/{}); + emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), + s.clauses()); + return mlir::success(); } diff --git a/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp b/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp new file mode 100644 index 0000000000000..9882f050045d3 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s + +void use(int x, unsigned int y, float f) { + // CHECK: cir.func{{.*}}(%[[X_ARG:.*]]: !s32i{{.*}}, %[[Y_ARG:.*]]: !u32i{{.*}}, %[[F_ARG:.*]]: !cir.float{{.*}}){{.*}}{ + // CHECK-NEXT: %[[X_ALLOC:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] + // CHECK-NEXT: %[[Y_ALLOC:.*]] = cir.alloca !u32i, !cir.ptr, ["y", init] + // CHECK-NEXT: %[[F_ALLOC:.*]] = cir.alloca !cir.float, !cir.ptr, ["f", init] + // CHECK-NEXT: cir.store %[[X_ARG]], %[[X_ALLOC]] : !s32i, !cir.ptr + // CHECK-NEXT: cir.store %[[Y_ARG]], %[[Y_ALLOC]] : !u32i, !cir.ptr + // CHECK-NEXT: cir.store %[[F_ARG]], %[[F_ALLOC]] : !cir.float, !cir.ptr + + // CHECK-NEXT: acc.atomic.read %[[X_ALLOC]] = %[[Y_ALLOC]] : !cir.ptr, !cir.ptr, !s32i +#pragma acc atomic read + x = y; + + // CHECK-NEXT: %[[X_LOAD:.*]] = cir.load{{.*}} %[[X_ALLOC]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[X_CAST:.*]] = cir.cast integral %[[X_LOAD]] : !s32i -> !u32i + // CHECK-NEXT: %[[Y_LOAD:.*]] = cir.load{{.*}} %[[Y_ALLOC]] : !cir.ptr, !u32i + // CHECK-NEXT: %[[CMP:.*]] = cir.cmp(eq, %[[X_CAST]], %[[Y_LOAD]]) : !u32i, !cir.bool + // CHECK-NEXT: %[[CMP_CAST:.*]] = builtin.unrealized_conversion_cast %[[CMP]] : !cir.bool to i1 + // CHECK-NEXT: acc.atomic.read if(%[[CMP_CAST]]) %[[F_ALLOC]] = %[[Y_ALLOC]] : !cir.ptr, !cir.ptr, !cir.float +#pragma acc atomic read if (x == y) + f = y; +} From d08cbc1cdd7b73e9a582f5602e8ca4829decab8c Mon Sep 17 00:00:00 2001 From: Hanumanth Date: Wed, 22 Oct 2025 09:19:40 -0400 Subject: [PATCH 097/530] [mlir][linalg] Fix Linalg runtime verification pass to handle tensors with dimensions of size 0 (#163791) Runtime verification on Linalg structured ops unconditionally computed `end - 1` to determine the last iteration index before composing indexing maps. This caused spurious "negative index" assertion failures while operating on empty tensors (tensors with a dimension of size 0). The issue occurs because: 1. Empty tensors create loop ranges [0, 0) with zero trip count 2. Computing end - 1 = 0 - 1 = -1 creates a fictitious negative index 3. The negative index check triggers even though no loop iterations occur The fix is to guard all runtime verification with a check that ensures all loop ranges are non-empty (start < end) before performing any index arithmetic. 
Example MLIR that previously failed: ```mlir func.func @fill_empty() -> tensor<0xi32> { %c0 = arith.constant 0 : i32 %empty = tensor.empty() : tensor<0xi32> %filled = linalg.fill ins(%c0 : i32) outs(%empty : tensor<0xi32>) -> tensor<0xi32> return %filled : tensor<0xi32> } ``` --------- Co-authored-by: Hanumanth Hanumantharayappa --- .../Transforms/RuntimeOpVerification.cpp | 28 +++++++++++++++++++ .../Linalg/CPU/runtime-verification.mlir | 23 +++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp index 15eb51a6dcab2..181b4846835c0 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Interfaces/RuntimeVerifiableOpInterface.h" @@ -43,6 +44,32 @@ struct StructuredOpInterface auto zero = arith::ConstantIndexOp::create(builder, loc, 0); auto one = arith::ConstantIndexOp::create(builder, loc, 1); + Value iterationDomainIsNonDegenerate; + for (auto [start, end] : llvm::zip(starts, ends)) { + auto startValue = getValueOrCreateConstantIndexOp(builder, loc, start); + auto endValue = getValueOrCreateConstantIndexOp(builder, loc, end); + + // Loop Trip count > 0 iff start < end + Value dimensionHasNonZeroTripCount = builder.create( + loc, index::IndexCmpPredicate::SLT, startValue, endValue); + + if (!iterationDomainIsNonDegenerate) { + iterationDomainIsNonDegenerate = dimensionHasNonZeroTripCount; + } else { + // Iteration domain is non-degenerate iff all dimensions have loop trip + // count > 0 + iterationDomainIsNonDegenerate = builder.create( + loc, iterationDomainIsNonDegenerate, dimensionHasNonZeroTripCount); + } + } + + if (!iterationDomainIsNonDegenerate) + return; + + auto ifOp = builder.create(loc, iterationDomainIsNonDegenerate, + /*withElseRegion=*/false); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + // Subtract one from the loop ends before composing with the indexing map transform(ends, ends.begin(), [&](OpFoldResult end) { auto endValue = getValueOrCreateConstantIndexOp(builder, loc, end); @@ -110,6 +137,7 @@ struct StructuredOpInterface builder.createOrFold(loc, cmpOp, msg); } } + builder.setInsertionPointAfter(ifOp); } }; diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir index 9f4393efc87bf..127ab70cb4539 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir @@ -103,6 +103,17 @@ func.func @main() { // CHECK: unexpected negative result on dimension #0 of input/output operand #0 func.call @reverse_from_3(%d5x) : (tensor) -> (tensor) + %c0x = arith.constant dense<1.0> : tensor<0xf32> + %d0x = tensor.cast %c0x : tensor<0xf32> to tensor + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @fill_empty_1d(%d0x) : (tensor) -> (tensor) + + %c0x5 = arith.constant dense<0.0> : tensor<0x5xf32> + %d0x5 = tensor.cast %c0x5 : tensor<0x5xf32> to tensor + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @fill_empty_2d(%d0x5) : (tensor) -> (tensor) + return } @@ -297,3 +308,15 @@ func.func 
@reverse_from_3(%arg0: tensor) -> (tensor) { } -> tensor return %result : tensor } + +func.func @fill_empty_1d(%arg0: tensor) -> (tensor) { + %c0 = arith.constant 0.0 : f32 + %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor) -> tensor + return %0 : tensor +} + +func.func @fill_empty_2d(%arg0: tensor) -> (tensor) { + %c0 = arith.constant 0.0 : f32 + %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor) -> tensor + return %0 : tensor +} From 64a8d73fc76f32ade700ff0126d356e2bf469a60 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Wed, 22 Oct 2025 21:22:25 +0800 Subject: [PATCH 098/530] [NFC] Use macros only when __AVX512IFMA__ and __AVXIFMA__ undefined (#162760) --- clang/lib/Headers/avx512ifmavlintrin.h | 44 +++++++++++++++++++++----- clang/lib/Headers/avxifmaintrin.h | 7 ++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index c4449c7ece9ff..b377c17166ffb 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -37,6 +37,7 @@ #endif +#if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) #define _mm_madd52hi_epu64(X, Y, Z) \ ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \ (__v2di)(Z))) @@ -52,56 +53,83 @@ #define _mm256_madd52lo_epu64(X, Y, Z) \ ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \ (__v4di)(Z))) +#endif + +#if defined(__AVX512IFMA__) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} +#endif static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), (__v2di)__W); + __M, (__v2di)__builtin_ia32_vpmadd52huq128(__W, __X, __Y), (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), + __M, (__v2di)__builtin_ia32_vpmadd52huq128(__X, __Y, __Z), (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52hi_epu64( __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), (__v4di)__W); + __M, (__v4di)__builtin_ia32_vpmadd52huq256(__W, __X, __Y), (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52hi_epu64( __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), + __M, (__v4di)__builtin_ia32_vpmadd52huq256(__X, __Y, __Z), (__v4di)_mm256_setzero_si256()); } 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), (__v2di)__W); + __M, (__v2di)__builtin_ia32_vpmadd52luq128(__W, __X, __Y), (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), + __M, (__v2di)__builtin_ia32_vpmadd52luq128(__X, __Y, __Z), (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52lo_epu64( __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), (__v4di)__W); + __M, (__v4di)__builtin_ia32_vpmadd52luq256(__W, __X, __Y), (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52lo_epu64( __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), + __M, (__v4di)__builtin_ia32_vpmadd52luq256(__X, __Y, __Z), (__v4di)_mm256_setzero_si256()); } diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index a2ef601913431..e452d5f0920e9 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -31,6 +31,13 @@ __min_vector_width__(256))) #endif +#if !defined(__AVX512IFMA__) && defined(__AVXIFMA__) +#define _mm_madd52hi_epu64(X, Y, Z) _mm_madd52hi_avx_epu64(X, Y, Z) +#define _mm_madd52lo_epu64(X, Y, Z) _mm_madd52lo_avx_epu64(X, Y, Z) +#define _mm256_madd52hi_epu64(X, Y, Z) _mm256_madd52hi_avx_epu64(X, Y, Z) +#define _mm256_madd52lo_epu64(X, Y, Z) _mm256_madd52lo_avx_epu64(X, Y, Z) +#endif + // must vex-encoding /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y From 411be14eab7ac4600595ad622810e8918c54856d Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Wed, 22 Oct 2025 14:36:21 +0100 Subject: [PATCH 099/530] [AgressiveInstCombine] Merge debug info on merged stores (#164449) A bit of debug info maintenaince for #147540. --- .../AggressiveInstCombine.cpp | 12 ++++- .../aggressive-instcombine-store-merge-dbg.ll | 49 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index bbbac45e225a6..7a95df4b2a47c 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -907,10 +907,20 @@ static bool mergeConsecutivePartStores(ArrayRef Parts, StoreInst *Store = Builder.CreateAlignedStore( Val, First.Store->getPointerOperand(), First.Store->getAlign()); + // Merge various metadata onto the new store. 
AAMDNodes AATags = First.Store->getAAMetadata(); - for (const PartStore &Part : drop_begin(Parts)) + SmallVector Stores = {First.Store}; + Stores.reserve(Parts.size()); + SmallVector DbgLocs = {First.Store->getDebugLoc()}; + DbgLocs.reserve(Parts.size()); + for (const PartStore &Part : drop_begin(Parts)) { AATags = AATags.concat(Part.Store->getAAMetadata()); + Stores.push_back(Part.Store); + DbgLocs.push_back(Part.Store->getDebugLoc()); + } Store->setAAMetadata(AATags); + Store->mergeDIAssignID(Stores); + Store->setDebugLoc(DebugLoc::getMergedLocations(DbgLocs)); // Remove the old stores. for (const PartStore &Part : Parts) diff --git a/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll new file mode 100644 index 0000000000000..f6e941a21deb0 --- /dev/null +++ b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +;; Aggressive instcombine merges the two i8 stores into an i16 store. Check +;; the debug location and DIAssignID metadata get merged. + +; CHECK: define void @test_i16(i16 %x, ptr %p) !dbg ![[#]] { +; CHECK-NEXT: store i16 %x, ptr %p, align 1, !dbg ![[DBG:[0-9]+]], !DIAssignID ![[ID:[0-9]+]] +; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]], +; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8), +; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(), ![[#]]) +; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]], +; CHECK-SAME: !DIExpression(DW_OP_constu, 8, DW_OP_shr, DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 8, 8), +; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(DW_OP_plus_uconst, 1), ![[#]]) +; CHECK-NEXT: ret void + +; CHECK: ![[DBG]] = !DILocation(line: 0, scope: ![[#]]) + +define void @test_i16(i16 %x, ptr %p) !dbg !5 { + %x.0 = trunc i16 %x to i8 + store i8 %x.0, ptr %p, align 1, !dbg !16, !DIAssignID !17 + #dbg_assign(i8 %x.0, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !17, ptr %p, !DIExpression(), !18) + %shr.1 = lshr i16 %x, 8 + %x.1 = trunc i16 %shr.1 to i8 + %gep.1 = getelementptr i8, ptr %p, i64 1 + store i8 %x.1, ptr %gep.1, align 1, !dbg !19, !DIAssignID !20 + #dbg_assign(i8 %x.1, !9, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !20, ptr %gep.1, !DIExpression(), !18) + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "/app/example.ll", directory: "/") +!2 = !{i32 7} +!3 = !{i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "test_i16", linkageName: "test_i16", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty16", size: 16, encoding: DW_ATE_unsigned) +!16 = !DILocation(line: 2, column: 1, scope: !5) +!17 = distinct !DIAssignID() +!18 = !DILocation(line: 1, column: 1, scope: !5) +!19 = !DILocation(line: 6, column: 1, scope: !5) +!20 = distinct !DIAssignID() From 9abbec66bfa34922521ef88fad1d6fcd43c1c462 Mon Sep 17 
00:00:00 2001 From: LU-JOHN Date: Wed, 22 Oct 2025 08:42:29 -0500 Subject: [PATCH 100/530] [AMDGPU] Reland "Remove redundant s_cmp_lg_* sX, 0" (#164201) Reland PR https://github.com/llvm/llvm-project/pull/162352. Fix by excluding SI_PC_ADD_REL_OFFSET from instructions that set SCC = DST!=0. Passes check-libc-amdgcn-amd-amdhsa now. Distribution of instructions that allowed a redundant S_CMP to be deleted in check-libc-amdgcn-amd-amdhsa test: ``` S_AND_B32 485 S_AND_B64 47 S_ANDN2_B32 42 S_ANDN2_B64 277492 S_CSELECT_B64 17631 S_LSHL_B32 6 S_OR_B64 11 ``` --------- Signed-off-by: John Lu Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 69 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 46 + .../GlobalISel/llvm.amdgcn.ballot.i32.ll | 2 - .../GlobalISel/llvm.amdgcn.ballot.i64.ll | 2 - llvm/test/CodeGen/AMDGPU/addsub64_carry.ll | 36 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 1260 ++++++++--------- .../AMDGPU/atomic_optimizations_buffer.ll | 135 +- .../atomic_optimizations_global_pointer.ll | 210 ++- .../atomic_optimizations_local_pointer.ll | 585 +++----- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 90 +- .../atomic_optimizations_struct_buffer.ll | 90 +- .../test/CodeGen/AMDGPU/carryout-selection.ll | 614 ++++---- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 3 +- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 2 - .../expand-scalar-carry-out-select-user.ll | 25 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 35 +- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 128 +- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 36 +- .../AMDGPU/global_atomics_scan_fadd.ll | 115 +- .../AMDGPU/global_atomics_scan_fmax.ll | 81 +- .../AMDGPU/global_atomics_scan_fmin.ll | 81 +- .../AMDGPU/global_atomics_scan_fsub.ll | 115 +- .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 20 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 128 +- llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 82 +- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 64 +- .../CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll | 4 - llvm/test/CodeGen/AMDGPU/sdiv64.ll | 146 +- llvm/test/CodeGen/AMDGPU/srem.ll | 654 ++++----- llvm/test/CodeGen/AMDGPU/srem64.ll | 207 ++- llvm/test/CodeGen/AMDGPU/uaddo.ll | 54 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 80 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 146 +- llvm/test/CodeGen/AMDGPU/usubo.ll | 54 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 190 ++- .../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 8 - 36 files changed, 2461 insertions(+), 3136 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2ff2d2f62cff7..d930a21c2d7f5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) return false; + const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, + this]() -> bool { + if (CmpValue != 0) + return false; + + MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); + if (!Def || Def->getParent() != CmpInstr.getParent()) + return false; + + const auto foldableSelect = [](MachineInstr *Def) -> bool { + if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || + Def->getOpcode() == AMDGPU::S_CSELECT_B64) { + bool Op1IsNonZeroImm = + Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; + if (Op1IsNonZeroImm && Op2IsZeroImm) + return true; + } + return false; + }; + + // For S_OP that set SCC = 
DST!=0, do the transformation + // + // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) + + // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value + // for S_CSELECT* already has the same value that will be calculated by + // s_cmp_lg_* + // + // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero + // imm), 0) + if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def)) + return false; + + MachineInstr *KillsSCC = nullptr; + for (MachineInstr &MI : + make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + + if (MachineOperand *SccDef = + Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + CmpInstr.eraseFromParent(); + return true; + }; + const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, bool IsReversible, bool IsSigned) -> bool { @@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); - I != E; ++I) { - if (I->modifiesRegister(AMDGPU::SCC, &RI) || - I->killsRegister(AMDGPU::SCC, &RI)) + MachineInstr *KillsSCC = nullptr; + for (MachineInstr &MI : + make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; } MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); CmpInstr.eraseFromParent(); if (!MRI->use_nodbg_empty(DefReg)) { @@ -10755,7 +10812,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_LG_I32: case AMDGPU::S_CMPK_LG_U32: case AMDGPU::S_CMPK_LG_I32: - return optimizeCmpAnd(0, 32, true, false); + return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect(); case AMDGPU::S_CMP_GT_U32: case AMDGPU::S_CMPK_GT_U32: return optimizeCmpAnd(0, 32, false, false); @@ -10763,7 +10820,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMPK_GT_I32: return optimizeCmpAnd(0, 32, false, true); case AMDGPU::S_CMP_LG_U64: - return optimizeCmpAnd(0, 64, true, false); + return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect(); } return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e1d7a07b0d169..5fdeddaf3f736 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -714,6 +714,52 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { } } + static bool setsSCCifResultIsNonZero(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_ABSDIFF_I32: + case AMDGPU::S_ABS_I32: + case AMDGPU::S_AND_B32: + case AMDGPU::S_AND_B64: + case AMDGPU::S_ANDN2_B32: + case AMDGPU::S_ANDN2_B64: + case AMDGPU::S_ASHR_I32: + case AMDGPU::S_ASHR_I64: + case AMDGPU::S_BCNT0_I32_B32: + case AMDGPU::S_BCNT0_I32_B64: + case AMDGPU::S_BCNT1_I32_B32: + case AMDGPU::S_BCNT1_I32_B64: + case AMDGPU::S_BFE_I32: + case AMDGPU::S_BFE_I64: + case AMDGPU::S_BFE_U32: + case AMDGPU::S_BFE_U64: + case AMDGPU::S_LSHL_B32: + case 
AMDGPU::S_LSHL_B64: + case AMDGPU::S_LSHR_B32: + case AMDGPU::S_LSHR_B64: + case AMDGPU::S_NAND_B32: + case AMDGPU::S_NAND_B64: + case AMDGPU::S_NOR_B32: + case AMDGPU::S_NOR_B64: + case AMDGPU::S_NOT_B32: + case AMDGPU::S_NOT_B64: + case AMDGPU::S_OR_B32: + case AMDGPU::S_OR_B64: + case AMDGPU::S_ORN2_B32: + case AMDGPU::S_ORN2_B64: + case AMDGPU::S_QUADMASK_B32: + case AMDGPU::S_QUADMASK_B64: + case AMDGPU::S_WQM_B32: + case AMDGPU::S_WQM_B64: + case AMDGPU::S_XNOR_B32: + case AMDGPU::S_XNOR_B64: + case AMDGPU::S_XOR_B32: + case AMDGPU::S_XOR_B64: + return true; + default: + return false; + } + } + static bool isEXP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::EXP; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 51714035352a3..7714c032d1737 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 7b01f13b9ef1c..7b8166948610b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll index b72eba8cdb519..8088c1b4c8fc7 100644 --- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll +++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll @@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B ; CHECK-LABEL: s_add64_32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s2, s4, 0 ; CHECK-NEXT: ; return to shader part epilog %sum64 = add i64 %val64A, %val64B @@ -199,14 +195,10 @@ define amdgpu_ps 
%struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s2, s6 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 -; CHECK-NEXT: s_addc_u32 s8, s3, s7 +; CHECK-NEXT: s_add_u32 s6, s2, s6 +; CHECK-NEXT: s_addc_u32 s7, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s4 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_mov_b32_e32 v5, s8 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_sub_u32 s10, s2, s6 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 -; CHECK-NEXT: s_subb_u32 s8, s3, s7 +; CHECK-NEXT: s_sub_u32 s6, s2, s6 +; CHECK-NEXT: s_subb_u32 s7, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_sub_u32 s0, s0, s4 -; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_mov_b32_e32 v5, s8 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) ; CHECK-LABEL: s_uadd_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -363,8 +345,6 @@ define 
amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_n1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, -1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, -1 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 948811ea45f77..51df8c34cc55e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s0, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s15, s16, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s12 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 @@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 ; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s14, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: s_add_u32 s16, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s4 +; GFX6-NEXT: s_addc_u32 s17, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s15 +; GFX6-NEXT: s_mul_i32 s4, s10, s17 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s14 -; GFX6-NEXT: s_add_i32 s16, s4, s5 -; GFX6-NEXT: s_sub_i32 s17, s7, s16 -; GFX6-NEXT: s_mul_i32 s4, s10, s14 +; GFX6-NEXT: s_mul_i32 s5, s11, s16 +; GFX6-NEXT: s_add_i32 s18, s4, s5 +; GFX6-NEXT: s_sub_i32 s14, s7, s18 +; GFX6-NEXT: s_mul_i32 s4, s10, s16 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s18, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s17, s17, s11 -; GFX6-NEXT: s_sub_u32 s19, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s4, s5 +; GFX6-NEXT: s_subb_u32 s19, s14, s11 +; GFX6-NEXT: s_sub_u32 s20, s6, s10 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s11 +; GFX6-NEXT: s_cselect_b32 s14, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s16, 1 +; GFX6-NEXT: s_addc_u32 s19, s17, 0 +; GFX6-NEXT: s_add_u32 s20, s16, 2 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s20, s15 +; GFX6-NEXT: s_cselect_b32 s15, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s17, 0 +; GFX6-NEXT: s_subb_u32 s4, 
s7, s18 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s10 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s17, s5 -; GFX6-NEXT: s_add_u32 s5, s14, 1 -; GFX6-NEXT: s_addc_u32 s17, s15, 0 -; GFX6-NEXT: s_add_u32 s19, s14, 2 -; GFX6-NEXT: s_addc_u32 s20, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s4, s19, s5 -; GFX6-NEXT: s_cselect_b32 s5, s20, s17 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_subb_u32 s7, s7, s16 -; GFX6-NEXT: s_cmp_ge_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s16, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s7, s11 -; GFX6-NEXT: s_cselect_b32 s6, s6, s16 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s5, s5, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s14 +; GFX6-NEXT: s_cmp_eq_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s6, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s5, s15, s17 +; GFX6-NEXT: s_cselect_b32 s4, s14, s16 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 @@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s10, 0, s8 -; GFX9-NEXT: s_subb_u32 s11, 0, s9 +; GFX9-NEXT: s_sub_u32 s4, 0, s8 +; GFX9-NEXT: s_subb_u32 s5, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4 -; GFX9-NEXT: s_mul_i32 s13, s11, s4 -; GFX9-NEXT: s_add_i32 s5, s14, s5 -; GFX9-NEXT: s_mul_i32 s15, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15 -; GFX9-NEXT: s_mul_i32 s16, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11 +; GFX9-NEXT: s_mul_i32 s13, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s15, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s16 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s5, s12, s5 -; GFX9-NEXT: s_add_u32 s5, s13, s5 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: 
s_mul_i32 s4, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 +; GFX9-NEXT: s_mul_i32 s14, s10, s4 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12 +; GFX9-NEXT: s_addc_u32 s4, s15, s13 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s10, s4 -; GFX9-NEXT: s_addc_u32 s10, 0, s5 -; GFX9-NEXT: s_add_u32 s11, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s11, s11, s4 +; GFX9-NEXT: s_addc_u32 s10, s10, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_addc_u32 s11, s12, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 ; GFX9-NEXT: s_mul_i32 s10, s3, s10 -; GFX9-NEXT: s_add_u32 s14, s11, s10 -; GFX9-NEXT: s_addc_u32 s15, 0, s12 -; GFX9-NEXT: s_mul_i32 s10, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14 +; GFX9-NEXT: s_add_u32 s13, s11, s10 +; GFX9-NEXT: s_addc_u32 s12, 0, s12 +; GFX9-NEXT: s_mul_i32 s10, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s9, s14 -; GFX9-NEXT: s_add_i32 s16, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s3, s16 -; GFX9-NEXT: s_mul_i32 s10, s8, s14 +; GFX9-NEXT: s_mul_i32 s11, s9, s13 +; GFX9-NEXT: s_add_i32 s14, s10, s11 +; GFX9-NEXT: s_sub_i32 s15, s3, s14 +; GFX9-NEXT: s_mul_i32 s10, s8, s13 ; GFX9-NEXT: s_sub_u32 s2, s2, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s17, s12, s9 -; GFX9-NEXT: s_sub_u32 s18, s2, s8 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s17, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s18, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, s9 +; GFX9-NEXT: s_sub_u32 s16, s2, s8 +; GFX9-NEXT: s_subb_u32 s15, s15, 0 +; GFX9-NEXT: s_cmp_ge_u32 s15, s9 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s9 -; GFX9-NEXT: s_cselect_b32 s12, s17, s13 -; GFX9-NEXT: s_add_u32 s13, s14, 1 -; GFX9-NEXT: s_addc_u32 s17, s15, 0 -; GFX9-NEXT: s_add_u32 s18, s14, 2 -; GFX9-NEXT: s_addc_u32 s19, s15, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; 
GFX9-NEXT: s_cselect_b32 s12, s18, s13 -; GFX9-NEXT: s_cselect_b32 s13, s19, s17 +; GFX9-NEXT: s_cmp_ge_u32 s16, s8 +; GFX9-NEXT: s_cselect_b32 s16, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s15, s9 +; GFX9-NEXT: s_cselect_b32 s15, s16, s17 +; GFX9-NEXT: s_add_u32 s16, s13, 1 +; GFX9-NEXT: s_addc_u32 s17, s12, 0 +; GFX9-NEXT: s_add_u32 s18, s13, 2 +; GFX9-NEXT: s_addc_u32 s19, s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 +; GFX9-NEXT: s_cselect_b32 s16, s19, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s16 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 ; GFX9-NEXT: s_cmp_ge_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s2, s8 @@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s3, s13, s15 -; GFX9-NEXT: s_cselect_b32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b32 s3, s16, s12 +; GFX9-NEXT: s_cselect_b32 s2, s15, s13 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_sub_u32 s2, s2, s4 @@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s17, 0, s18 ; GFX6-NEXT: s_add_u32 s18, s12, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s16, s16, s17 ; GFX6-NEXT: s_mul_i32 s12, s14, s16 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 @@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s15, s18, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s14, s16, s14 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 @@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s17, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NEXT: s_add_u32 s18, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s16 +; GFX6-NEXT: s_addc_u32 s19, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s19 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s17 -; GFX6-NEXT: s_add_i32 s18, s14, s15 -; GFX6-NEXT: s_sub_i32 s19, s9, s18 -; GFX6-NEXT: s_mul_i32 s14, s6, s17 +; GFX6-NEXT: s_mul_i32 s15, s7, s18 +; GFX6-NEXT: s_add_i32 s20, s14, s15 +; GFX6-NEXT: s_sub_i32 s16, s9, s20 +; GFX6-NEXT: s_mul_i32 s14, s6, s18 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s7 -; GFX6-NEXT: s_sub_u32 s21, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s14, s15 +; GFX6-NEXT: s_subb_u32 s21, s16, s7 +; GFX6-NEXT: s_sub_u32 s22, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX6-NEXT: s_or_b32 s16, s16, s17 +; GFX6-NEXT: s_subb_u32 s16, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s7 
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, s7 +; GFX6-NEXT: s_cselect_b32 s16, s21, s17 +; GFX6-NEXT: s_add_u32 s17, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b32 s16, s22, s17 +; GFX6-NEXT: s_cselect_b32 s17, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s7 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s17, 1 -; GFX6-NEXT: s_addc_u32 s19, s16, 0 -; GFX6-NEXT: s_add_u32 s21, s17, 2 -; GFX6-NEXT: s_addc_u32 s22, s16, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_cselect_b32 s15, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s18 +; GFX6-NEXT: s_subb_u32 s9, s9, s20 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s18 +; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s15, s16 -; GFX6-NEXT: s_cselect_b32 s6, s14, s17 +; GFX6-NEXT: s_cselect_b32 s7, s17, s19 +; GFX6-NEXT: s_cselect_b32 s6, s16, s18 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s14, s6, s2 -; GFX6-NEXT: s_subb_u32 s15, s7, s3 +; GFX6-NEXT: s_sub_u32 s16, s6, s2 +; GFX6-NEXT: s_subb_u32 s17, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 +; GFX6-NEXT: s_mul_i32 s1, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s13, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s17, s12, s2 +; GFX6-NEXT: s_mul_i32 s15, s12, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s18, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s17, s16, s17 +; GFX6-NEXT: s_mul_i32 s15, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s17 +; GFX6-NEXT: s_add_u32 s4, s4, s15 ; GFX6-NEXT: s_addc_u32 s4, s5, s18 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s16, s3 +; GFX6-NEXT: s_mul_i32 s3, s14, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, 
s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s16, s4 +; GFX6-NEXT: s_addc_u32 s4, s14, s4 ; GFX6-NEXT: s_mul_i32 s2, s12, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: v_readfirstlane_b32 s12, v3 ; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s16, s12 +; GFX6-NEXT: s_addc_u32 s3, s14, s12 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_addc_u32 s12, s12, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 @@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s12, s4, s12 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 @@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s17, v3 +; GFX6-NEXT: v_readfirstlane_b32 s15, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s17, s2 -; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_add_u32 s2, s15, s2 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: s_addc_u32 s2, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s16, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_add_u32 s18, s2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s17 +; GFX6-NEXT: s_addc_u32 s19, 0, s13 +; GFX6-NEXT: s_mul_i32 s12, s8, s19 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s16 -; GFX6-NEXT: s_add_i32 s18, s12, s13 -; GFX6-NEXT: s_sub_i32 s19, s11, s18 -; GFX6-NEXT: s_mul_i32 s12, s8, s16 +; GFX6-NEXT: s_mul_i32 s13, s9, s18 +; GFX6-NEXT: s_add_i32 s20, s12, s13 +; GFX6-NEXT: s_sub_i32 s14, s11, s20 +; GFX6-NEXT: s_mul_i32 s12, s8, s18 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s20, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s19, s19, s9 -; GFX6-NEXT: s_sub_u32 s21, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 
+; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s21, s14, s9 +; GFX6-NEXT: s_sub_u32 s22, s10, s8 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s21, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s9 +; GFX6-NEXT: s_cselect_b32 s14, s21, s15 +; GFX6-NEXT: s_add_u32 s15, s18, 1 +; GFX6-NEXT: s_addc_u32 s21, s19, 0 +; GFX6-NEXT: s_add_u32 s22, s18, 2 +; GFX6-NEXT: s_addc_u32 s23, s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s22, s15 +; GFX6-NEXT: s_cselect_b32 s15, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s21, s8 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, s9 -; GFX6-NEXT: s_cselect_b32 s12, s19, s13 -; GFX6-NEXT: s_add_u32 s13, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s21, s16, 2 -; GFX6-NEXT: s_addc_u32 s22, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b32 s12, s21, s13 -; GFX6-NEXT: s_cselect_b32 s13, s22, s19 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_subb_u32 s11, s11, s18 +; GFX6-NEXT: s_subb_u32 s11, s11, s20 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s8, s8, s18 +; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s13, s17 -; GFX6-NEXT: s_cselect_b32 s8, s12, s16 +; GFX6-NEXT: s_cselect_b32 s9, s15, s19 +; GFX6-NEXT: s_cselect_b32 s8, s14, s18 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s14, 0, s6 -; GFX9-NEXT: s_subb_u32 s15, 0, s7 +; GFX9-NEXT: s_sub_u32 s12, 0, s6 +; GFX9-NEXT: s_subb_u32 s13, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s16, v1 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s13, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12 -; GFX9-NEXT: s_mul_i32 s17, s15, s12 -; GFX9-NEXT: s_add_i32 s13, s18, s13 -; GFX9-NEXT: s_mul_i32 s19, s14, s12 -; GFX9-NEXT: s_add_i32 s13, s13, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19 -; GFX9-NEXT: s_mul_i32 s20, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 
+; GFX9-NEXT: v_readfirstlane_b32 s15, v0 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15 +; GFX9-NEXT: s_mul_i32 s17, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s18, s16 +; GFX9-NEXT: s_mul_i32 s19, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 ; GFX9-NEXT: s_add_u32 s18, s18, s20 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19 -; GFX9-NEXT: s_mul_i32 s19, s16, s19 +; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 +; GFX9-NEXT: s_mul_i32 s19, s14, s19 ; GFX9-NEXT: s_add_u32 s18, s18, s19 -; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13 +; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16 ; GFX9-NEXT: s_addc_u32 s17, s17, s20 ; GFX9-NEXT: s_addc_u32 s18, s21, 0 -; GFX9-NEXT: s_mul_i32 s13, s16, s13 -; GFX9-NEXT: s_add_u32 s13, s17, s13 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s16, s17, s16 ; GFX9-NEXT: s_addc_u32 s17, 0, s18 -; GFX9-NEXT: s_add_u32 s18, s12, s13 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_i32 s12, s14, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s15, s15, s18 -; GFX9-NEXT: s_add_i32 s12, s12, s15 -; GFX9-NEXT: s_mul_i32 s14, s14, s18 -; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14 -; GFX9-NEXT: s_mul_i32 s17, s16, s14 -; GFX9-NEXT: s_mul_i32 s20, s18, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14 -; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12 -; GFX9-NEXT: s_add_u32 s14, s14, s20 +; GFX9-NEXT: s_add_u32 s15, s15, s16 +; GFX9-NEXT: s_addc_u32 s14, s14, s17 +; GFX9-NEXT: s_mul_i32 s16, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_add_i32 s16, s17, s16 +; GFX9-NEXT: s_mul_i32 s13, s13, s15 +; GFX9-NEXT: s_add_i32 s16, s16, s13 +; GFX9-NEXT: s_mul_i32 s12, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12 +; GFX9-NEXT: s_mul_i32 s18, s14, s12 +; GFX9-NEXT: s_mul_i32 s20, s15, s16 +; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s20 ; GFX9-NEXT: s_addc_u32 s19, 0, s19 -; GFX9-NEXT: s_add_u32 s14, s14, s17 -; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12 -; GFX9-NEXT: s_addc_u32 s14, s19, s15 +; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16 +; GFX9-NEXT: s_addc_u32 s12, s19, s17 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_i32 s12, s16, s12 -; GFX9-NEXT: s_add_u32 s12, s14, s12 -; GFX9-NEXT: s_addc_u32 s14, 0, s13 -; GFX9-NEXT: s_add_u32 s15, s18, s12 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_addc_u32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s16, s14, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s16 +; GFX9-NEXT: s_addc_u32 s13, 0, s13 +; GFX9-NEXT: s_add_u32 s15, s15, s12 +; GFX9-NEXT: s_addc_u32 s14, s14, s13 ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 @@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_addc_u32 s15, s16, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 ; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s18, s15, s14 -; GFX9-NEXT: s_addc_u32 s19, 0, s16 -; GFX9-NEXT: s_mul_i32 s14, s6, s19 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18 +; GFX9-NEXT: s_add_u32 s17, s15, s14 +; GFX9-NEXT: s_addc_u32 s16, 0, s16 +; GFX9-NEXT: s_mul_i32 s14, s6, s16 +; GFX9-NEXT: 
s_mul_hi_u32 s15, s6, s17 ; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s15, s7, s18 -; GFX9-NEXT: s_add_i32 s20, s14, s15 -; GFX9-NEXT: s_sub_i32 s16, s9, s20 -; GFX9-NEXT: s_mul_i32 s14, s6, s18 +; GFX9-NEXT: s_mul_i32 s15, s7, s17 +; GFX9-NEXT: s_add_i32 s18, s14, s15 +; GFX9-NEXT: s_sub_i32 s19, s9, s18 +; GFX9-NEXT: s_mul_i32 s14, s6, s17 ; GFX9-NEXT: s_sub_u32 s8, s8, s14 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s21, s16, s7 -; GFX9-NEXT: s_sub_u32 s22, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GFX9-NEXT: s_subb_u32 s16, s21, 0 -; GFX9-NEXT: s_cmp_ge_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s22, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, s7 +; GFX9-NEXT: s_sub_u32 s20, s8, s6 +; GFX9-NEXT: s_subb_u32 s19, s19, 0 +; GFX9-NEXT: s_cmp_ge_u32 s19, s7 ; GFX9-NEXT: s_cselect_b32 s21, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, s7 -; GFX9-NEXT: s_cselect_b32 s16, s21, s17 -; GFX9-NEXT: s_add_u32 s17, s18, 1 -; GFX9-NEXT: s_addc_u32 s21, s19, 0 -; GFX9-NEXT: s_add_u32 s22, s18, 2 -; GFX9-NEXT: s_addc_u32 s23, s19, 0 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s16, s22, s17 -; GFX9-NEXT: s_cselect_b32 s17, s23, s21 +; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_cselect_b32 s20, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s19, s7 +; GFX9-NEXT: s_cselect_b32 s19, s20, s21 +; GFX9-NEXT: s_add_u32 s20, s17, 1 +; GFX9-NEXT: s_addc_u32 s21, s16, 0 +; GFX9-NEXT: s_add_u32 s22, s17, 2 +; GFX9-NEXT: s_addc_u32 s23, s16, 0 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b32 s19, s22, s20 +; GFX9-NEXT: s_cselect_b32 s20, s23, s21 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s20 +; GFX9-NEXT: s_subb_u32 s9, s9, s18 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s14 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s17, s19 -; GFX9-NEXT: s_cselect_b32 s6, s16, s18 +; GFX9-NEXT: s_cselect_b32 s7, s20, s16 +; GFX9-NEXT: s_cselect_b32 s6, s19, s17 ; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX9-NEXT: s_sub_u32 s14, s6, s2 -; GFX9-NEXT: s_subb_u32 s15, s7, s3 +; GFX9-NEXT: s_sub_u32 s12, s6, s2 +; GFX9-NEXT: s_subb_u32 s13, s7, s3 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_mov_b32 s3, s2 @@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s13, v2 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s8, s13 -; GFX9-NEXT: s_mul_i32 s5, s9, s4 -; 
GFX9-NEXT: s_add_i32 s12, s12, s16 -; GFX9-NEXT: s_add_i32 s12, s12, s5 -; GFX9-NEXT: s_mul_i32 s17, s8, s4 -; GFX9-NEXT: s_mul_i32 s16, s4, s12 -; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s15, v2 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s4, s15 +; GFX9-NEXT: s_mul_i32 s9, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s16 +; GFX9-NEXT: s_add_i32 s14, s14, s9 +; GFX9-NEXT: s_mul_i32 s17, s4, s8 +; GFX9-NEXT: s_mul_i32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14 ; GFX9-NEXT: s_add_u32 s16, s18, s16 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17 -; GFX9-NEXT: s_mul_i32 s17, s13, s17 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17 +; GFX9-NEXT: s_mul_i32 s17, s15, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12 -; GFX9-NEXT: s_addc_u32 s5, s5, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14 +; GFX9-NEXT: s_addc_u32 s9, s9, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 -; GFX9-NEXT: s_mul_i32 s12, s13, s12 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_addc_u32 s12, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s4, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s16 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s16 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_mul_i32 s13, s12, s8 -; GFX9-NEXT: s_mul_i32 s18, s16, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s18 +; GFX9-NEXT: s_mul_i32 s14, s15, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s14 +; GFX9-NEXT: s_addc_u32 s14, 0, s16 +; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_addc_u32 s9, s15, s14 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_add_i32 s14, s14, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4 +; GFX9-NEXT: s_mul_i32 s16, s9, s4 +; GFX9-NEXT: s_mul_i32 s18, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s8, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX9-NEXT: s_addc_u32 s8, s17, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14 +; GFX9-NEXT: s_addc_u32 s4, s17, s15 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s12, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s13, s16, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s8 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s14, s8, s4 +; GFX9-NEXT: s_addc_u32 s15, s9, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s9, s11, s4 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX9-NEXT: s_mul_i32 s11, s8, s15 +; 
GFX9-NEXT: s_mul_hi_u32 s16, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15 ; GFX9-NEXT: s_add_u32 s11, s16, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 -; GFX9-NEXT: s_mul_i32 s13, s9, s13 -; GFX9-NEXT: s_add_u32 s11, s11, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s11, s11, s14 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15 ; GFX9-NEXT: s_addc_u32 s10, s10, s17 ; GFX9-NEXT: s_addc_u32 s11, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s9, s12 -; GFX9-NEXT: s_add_u32 s16, s10, s12 -; GFX9-NEXT: s_addc_u32 s17, 0, s11 -; GFX9-NEXT: s_mul_i32 s10, s6, s17 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16 +; GFX9-NEXT: s_mul_i32 s14, s9, s15 +; GFX9-NEXT: s_add_u32 s14, s10, s14 +; GFX9-NEXT: s_addc_u32 s15, 0, s11 +; GFX9-NEXT: s_mul_i32 s10, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s7, s16 -; GFX9-NEXT: s_add_i32 s18, s10, s11 -; GFX9-NEXT: s_sub_i32 s12, s9, s18 -; GFX9-NEXT: s_mul_i32 s10, s6, s16 +; GFX9-NEXT: s_mul_i32 s11, s7, s14 +; GFX9-NEXT: s_add_i32 s16, s10, s11 +; GFX9-NEXT: s_sub_i32 s17, s9, s16 +; GFX9-NEXT: s_mul_i32 s10, s6, s14 ; GFX9-NEXT: s_sub_u32 s8, s8, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s19, s12, s7 -; GFX9-NEXT: s_sub_u32 s20, s8, s6 -; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GFX9-NEXT: s_subb_u32 s12, s19, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s20, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, s7 +; GFX9-NEXT: s_sub_u32 s18, s8, s6 +; GFX9-NEXT: s_subb_u32 s17, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s7 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s12, s19, s13 -; GFX9-NEXT: s_add_u32 s13, s16, 1 -; GFX9-NEXT: s_addc_u32 s19, s17, 0 -; GFX9-NEXT: s_add_u32 s20, s16, 2 -; GFX9-NEXT: s_addc_u32 s21, s17, 0 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s12, s20, s13 -; GFX9-NEXT: s_cselect_b32 s13, s21, s19 +; GFX9-NEXT: s_cmp_ge_u32 s18, s6 +; GFX9-NEXT: s_cselect_b32 s18, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s17, s7 +; GFX9-NEXT: s_cselect_b32 s17, s18, s19 +; GFX9-NEXT: s_add_u32 s18, s14, 1 +; GFX9-NEXT: s_addc_u32 s19, s15, 0 +; GFX9-NEXT: s_add_u32 s20, s14, 2 +; GFX9-NEXT: s_addc_u32 s21, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, s20, s18 +; GFX9-NEXT: s_cselect_b32 s18, s21, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s18 +; GFX9-NEXT: s_subb_u32 s9, s9, s16 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s13, s17 -; GFX9-NEXT: s_cselect_b32 s6, s12, s16 +; GFX9-NEXT: s_cselect_b32 s7, s18, s15 +; GFX9-NEXT: s_cselect_b32 s6, s17, s14 ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] ; GFX9-NEXT: s_sub_u32 s2, s4, s2 ; GFX9-NEXT: s_subb_u32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: 
v_mov_b32_e32 v4, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s13, 0, s14 ; GFX6-NEXT: s_add_u32 s14, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s13 ; GFX6-NEXT: s_mul_i32 s0, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s13, s14, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s10 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 @@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 ; GFX6-NEXT: s_mul_i32 s5, s9, s12 -; GFX6-NEXT: s_add_i32 s13, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s13 +; GFX6-NEXT: s_add_i32 s14, s4, s5 +; GFX6-NEXT: s_sub_i32 s13, s7, s14 ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s15, s6, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_subb_u32 s15, s13, s9 +; GFX6-NEXT: s_sub_u32 s16, s6, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s17, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s8 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s9 +; GFX6-NEXT: s_cselect_b32 s18, s19, s18 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s15, s15, s9 +; GFX6-NEXT: s_sub_u32 s19, s16, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s16, s14, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s9 +; GFX6-NEXT: s_subb_u32 s4, s7, s14 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s15, s8 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s9 -; GFX6-NEXT: s_cselect_b32 s17, s17, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s14, s14, s9 -; GFX6-NEXT: s_sub_u32 s18, s15, s8 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_subb_u32 s4, s14, 0 -; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b32 s14, s18, s15 -; GFX6-NEXT: s_cselect_b32 s4, s4, s16 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s5, s7, s13 -; GFX6-NEXT: s_cmp_ge_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s8 -; GFX6-NEXT: s_cselect_b32 s8, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s5, s9 -; GFX6-NEXT: s_cselect_b32 s7, s8, s7 -; GFX6-NEXT: s_cmp_lg_u32 s7, 0 -; GFX6-NEXT: s_cselect_b32 s5, s4, s5 -; GFX6-NEXT: s_cselect_b32 s4, s14, s6 +; GFX6-NEXT: s_cselect_b32 s7, -1, 0 +; 
GFX6-NEXT: s_cmp_eq_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_cselect_b32 s5, s12, s4 +; GFX6-NEXT: s_cselect_b32 s4, s13, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 @@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s4, 0, s6 +; GFX9-NEXT: s_subb_u32 s5, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s5, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s11, s9, s4 -; GFX9-NEXT: s_add_i32 s5, s12, s5 -; GFX9-NEXT: s_mul_i32 s13, s8, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX9-NEXT: s_mul_i32 s14, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX9-NEXT: s_mul_i32 s11, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10 ; GFX9-NEXT: s_add_u32 s12, s12, s14 ; GFX9-NEXT: s_addc_u32 s11, 0, s11 -; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13 -; GFX9-NEXT: s_mul_i32 s13, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13 +; GFX9-NEXT: s_mul_i32 s13, s8, s13 ; GFX9-NEXT: s_add_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10 ; GFX9-NEXT: s_addc_u32 s11, s11, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 -; GFX9-NEXT: s_mul_i32 s5, s10, s5 -; GFX9-NEXT: s_add_u32 s5, s11, s5 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s10, s11, s10 ; GFX9-NEXT: s_addc_u32 s11, 0, s12 -; GFX9-NEXT: s_add_u32 s12, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_i32 s4, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s9, s9, s12 -; GFX9-NEXT: s_add_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8 -; GFX9-NEXT: s_mul_i32 s11, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s12, s4 -; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4 -; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4 +; GFX9-NEXT: s_mul_i32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s14, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4 +; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10 +; GFX9-NEXT: 
s_add_u32 s4, s4, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s8, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4 -; GFX9-NEXT: s_addc_u32 s8, s13, s9 +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10 +; GFX9-NEXT: s_addc_u32 s4, s13, s11 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s10, s4 -; GFX9-NEXT: s_add_u32 s4, s8, s4 -; GFX9-NEXT: s_addc_u32 s8, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s12, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s10, s8 +; GFX9-NEXT: s_mul_i32 s10, s8, s10 +; GFX9-NEXT: s_add_u32 s4, s4, s10 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s9, s4 +; GFX9-NEXT: s_addc_u32 s8, s8, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s8, s6, s8 ; GFX9-NEXT: s_sub_u32 s2, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s13, s10, s7 ; GFX9-NEXT: s_sub_u32 s14, s2, s6 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s15, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 @@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, s17, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s17, s14, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s13, 0 +; GFX9-NEXT: s_subb_u32 s10, s13, s7 +; GFX9-NEXT: s_sub_u32 s11, s14, s6 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s11, s17, s14 +; GFX9-NEXT: s_cselect_b32 s11, s11, s14 ; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s3, s3, s12 @@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s6, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s6, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 @@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s16, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s12, s14, s12 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 @@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_add_i32 s13, s14, s13 ; GFX6-NEXT: s_mul_i32 s14, s3, s12 -; GFX6-NEXT: s_add_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: s_add_i32 s16, s13, s14 +; GFX6-NEXT: s_sub_i32 s14, s9, s16 ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; 
GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s17, s8, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s13, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s2 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s3 -; GFX6-NEXT: s_cselect_b32 s19, s19, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s3 -; GFX6-NEXT: s_sub_u32 s20, s17, s2 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s15, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s14, s3 +; GFX6-NEXT: s_sub_u32 s18, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s14, s15 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s2 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s3 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s17, s17, s3 +; GFX6-NEXT: s_sub_u32 s21, s18, s2 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s13, s20, s17 -; GFX6-NEXT: s_cselect_b32 s12, s12, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s2 ; GFX6-NEXT: s_cselect_b32 s2, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s2, s2, s14 +; GFX6-NEXT: s_cselect_b32 s2, s2, s12 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s3, s12, s9 -; GFX6-NEXT: s_cselect_b32 s2, s13, s8 +; GFX6-NEXT: s_cselect_b32 s3, s14, s9 +; GFX6-NEXT: s_cselect_b32 s2, s15, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_u32 s12, s2, s6 -; GFX6-NEXT: s_subb_u32 s13, s3, s6 +; GFX6-NEXT: s_sub_u32 s14, s2, s6 +; GFX6-NEXT: s_subb_u32 s15, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 @@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s14 +; GFX6-NEXT: s_mul_i32 s1, s8, s12 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s9, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s8, s2 +; GFX6-NEXT: s_mul_i32 s13, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: 
s_add_u32 s4, s16, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_add_u32 s4, s4, s13 ; GFX6-NEXT: s_addc_u32 s4, s5, s16 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_mul_i32 s3, s12, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_addc_u32 s4, s12, s4 ; GFX6-NEXT: s_mul_i32 s2, s8, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s9, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s9, s15, s9 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 +; GFX6-NEXT: s_add_u32 s9, s13, s9 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 ; GFX6-NEXT: v_readfirstlane_b32 s8, v3 ; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s8 +; GFX6-NEXT: s_addc_u32 s3, s12, s8 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: s_addc_u32 s8, s8, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 ; GFX6-NEXT: s_add_u32 s2, s3, s2 ; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s14, s5, s2 +; GFX6-NEXT: s_add_u32 s12, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s15, s4, s8 +; GFX6-NEXT: s_addc_u32 s13, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 -; GFX6-NEXT: s_mul_i32 s2, s8, s15 +; GFX6-NEXT: s_mul_i32 s2, s8, s13 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: s_add_u32 s2, s11, s2 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_mul_i32 s11, s9, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s11, s9, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s11 -; GFX6-NEXT: s_addc_u32 s2, s10, s14 +; GFX6-NEXT: s_addc_u32 s2, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s11, s9, s15 +; GFX6-NEXT: s_mul_i32 s11, s9, s13 ; GFX6-NEXT: s_add_u32 s11, s2, s11 ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 ; GFX6-NEXT: s_mul_i32 s10, s6, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_add_i32 s10, s14, s10 -; GFX6-NEXT: s_mul_i32 s14, s7, s11 -; GFX6-NEXT: s_add_i32 s14, 
s10, s14 -; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s10, s12, s10 +; GFX6-NEXT: s_mul_i32 s12, s7, s11 +; GFX6-NEXT: s_add_i32 s16, s10, s12 +; GFX6-NEXT: s_sub_i32 s12, s9, s16 ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s17, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s18, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s11, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s6 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, s7 -; GFX6-NEXT: s_cselect_b32 s19, s19, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s15, s15, s7 -; GFX6-NEXT: s_sub_u32 s20, s17, s6 -; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s13, s10, s11 +; GFX6-NEXT: s_subb_u32 s17, s12, s7 +; GFX6-NEXT: s_sub_u32 s18, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s19, s12, s13 +; GFX6-NEXT: s_subb_u32 s19, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s6 +; GFX6-NEXT: s_cselect_b32 s21, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s20, s21, s20 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s17, s17, s7 +; GFX6-NEXT: s_sub_u32 s21, s18, s6 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 -; GFX6-NEXT: s_subb_u32 s10, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b32 s11, s20, s17 -; GFX6-NEXT: s_cselect_b32 s10, s10, s18 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_subb_u32 s9, s9, s16 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s10, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s6, s6, s10 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s10, s9 -; GFX6-NEXT: s_cselect_b32 s6, s11, s8 +; GFX6-NEXT: s_cselect_b32 s7, s12, s9 +; GFX6-NEXT: s_cselect_b32 s6, s13, s8 ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_u32 s5, s6, s4 ; GFX6-NEXT: s_subb_u32 s4, s7, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s12, 0, s2 -; GFX9-NEXT: s_subb_u32 s13, 0, s3 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9790,56 +9734,52 @@ 
define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s7, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 -; GFX9-NEXT: s_mul_i32 s15, s13, s6 -; GFX9-NEXT: s_add_i32 s7, s16, s7 -; GFX9-NEXT: s_mul_i32 s17, s12, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 -; GFX9-NEXT: s_mul_i32 s18, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v0 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13 +; GFX9-NEXT: s_mul_i32 s15, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14 ; GFX9-NEXT: s_add_u32 s16, s16, s18 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 -; GFX9-NEXT: s_mul_i32 s17, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17 +; GFX9-NEXT: s_mul_i32 s17, s12, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 +; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14 ; GFX9-NEXT: s_addc_u32 s15, s15, s18 ; GFX9-NEXT: s_addc_u32 s16, s19, 0 -; GFX9-NEXT: s_mul_i32 s7, s14, s7 -; GFX9-NEXT: s_add_u32 s7, s15, s7 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s14, s15, s14 ; GFX9-NEXT: s_addc_u32 s15, 0, s16 -; GFX9-NEXT: s_add_u32 s16, s6, s7 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s6, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 -; GFX9-NEXT: s_add_i32 s6, s7, s6 -; GFX9-NEXT: s_mul_i32 s13, s13, s16 -; GFX9-NEXT: s_add_i32 s6, s6, s13 -; GFX9-NEXT: s_mul_i32 s12, s12, s16 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s14, s12 -; GFX9-NEXT: s_mul_i32 s18, s16, s6 -; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 -; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 -; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_add_u32 s13, s13, s14 +; GFX9-NEXT: s_addc_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s7, s7, s13 +; GFX9-NEXT: s_add_i32 s14, s14, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6 +; GFX9-NEXT: s_mul_i32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s18, s13, s14 +; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6 +; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s12, s12, s15 -; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 -; GFX9-NEXT: s_addc_u32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14 +; GFX9-NEXT: s_addc_u32 s6, s17, s15 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s6, s14, s6 -; GFX9-NEXT: s_add_u32 s6, s12, s6 -; GFX9-NEXT: s_addc_u32 s12, 0, s7 -; GFX9-NEXT: s_add_u32 s13, s16, s6 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-NEXT: s_addc_u32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s14, s12, s14 +; GFX9-NEXT: s_add_u32 s6, s6, s14 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s13, s6 +; GFX9-NEXT: s_addc_u32 s12, s12, s7 
; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s2, s12 ; GFX9-NEXT: s_sub_u32 s8, s8, s12 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s17, s14, s3 ; GFX9-NEXT: s_sub_u32 s18, s8, s2 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GFX9-NEXT: s_subb_u32 s19, s17, 0 ; GFX9-NEXT: s_cmp_ge_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, -1, 0 @@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, s21, s20 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s17, s17, s3 -; GFX9-NEXT: s_sub_u32 s21, s18, s2 -; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s14, s17, 0 +; GFX9-NEXT: s_subb_u32 s14, s17, s3 +; GFX9-NEXT: s_sub_u32 s15, s18, s2 +; GFX9-NEXT: s_subb_u32 s14, s14, 0 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s15, s21, s18 +; GFX9-NEXT: s_cselect_b32 s15, s15, s18 ; GFX9-NEXT: s_cselect_b32 s14, s14, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s9, s9, s16 @@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s6, 0, s2 -; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: s_sub_u32 s4, 0, s2 +; GFX9-NEXT: s_subb_u32 s5, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s6, s9 -; GFX9-NEXT: s_mul_i32 s5, s7, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s4, s9 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_i32 s15, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s15, s4, s6 +; GFX9-NEXT: s_mul_i32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 ; GFX9-NEXT: s_add_u32 s14, s16, s14 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 ; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 ; GFX9-NEXT: s_mul_i32 s15, s9, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, s17 +; GFX9-NEXT: s_addc_u32 s7, s7, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s4, s5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s4, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, 
s14 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s7, s7, s14 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s14 -; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 -; GFX9-NEXT: s_mul_i32 s9, s8, s6 -; GFX9-NEXT: s_mul_i32 s16, s14, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 -; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_addc_u32 s7, s9, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4 +; GFX9-NEXT: s_mul_i32 s14, s7, s4 +; GFX9-NEXT: s_mul_i32 s16, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s6, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 -; GFX9-NEXT: s_addc_u32 s6, s15, s7 +; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8 +; GFX9-NEXT: s_addc_u32 s4, s15, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s4, s8, s4 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s6, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s14, s4 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s8, s8, s6 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_u32 s8, s6, s4 +; GFX9-NEXT: s_addc_u32 s9, s7, s5 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s6, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s7, s11, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX9-NEXT: s_mul_i32 s11, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 ; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 -; GFX9-NEXT: s_mul_i32 s9, s7, s9 -; GFX9-NEXT: s_add_u32 s9, s11, s9 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s15 -; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8 ; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s9, s7, s9 +; GFX9-NEXT: s_add_u32 s8, s8, s9 ; GFX9-NEXT: s_addc_u32 s9, 0, s10 ; GFX9-NEXT: s_mul_i32 s9, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 @@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s8, s2, s8 ; GFX9-NEXT: s_sub_u32 s6, s6, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s15, s10, s3 ; GFX9-NEXT: s_sub_u32 s16, s6, s2 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s17, s15, 0 ; GFX9-NEXT: s_cmp_ge_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, -1, 0 @@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, s19, s18 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s15, s15, s3 -; GFX9-NEXT: s_sub_u32 s19, s16, s2 -; 
GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s15, 0 +; GFX9-NEXT: s_subb_u32 s10, s15, s3 +; GFX9-NEXT: s_sub_u32 s11, s16, s2 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b32 s11, s19, s16 +; GFX9-NEXT: s_cselect_b32 s11, s11, s16 ; GFX9-NEXT: s_cselect_b32 s10, s10, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s7, s7, s14 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 394727c88b0be..01f4414b930e1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr 
addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1042,12 +1034,11 @@ define 
amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: 
s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; 
GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 258bc2959f391..9db6d706b634b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; 
GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; 
GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, 
s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 @@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; 
GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1064_ITERATIVE-NEXT: 
s_add_u32 s8, s8, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, 
s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 23c5f4f5506f3..6167a84094b7a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, 
s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: 
s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 @@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; 
GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1164_ITERATIVE-NEXT: 
s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; 
GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5542,13 
+5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; 
GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; 
GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], 
s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; 
GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; 
GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; 
GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) 
%out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; 
GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; 
GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: 
s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; 
GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; 
GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: 
s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; 
GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15894,12 +15772,11 
@@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 
v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index e4def28667ed4..9afc0c62e846e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -817,11 +812,10 @@ define amdgpu_kernel void 
@add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1746,11 +1737,10 
@@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: 
.LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 39a3c9aade586..10fd34f08b83e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; 
GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s8 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b64 
s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX10W64-NEXT: s_add_i32 s2, s2, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; 
GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 4a6fa4f9ad859..b96de173dc8c6 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_add_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, 
s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s0, s12, s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_addc_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_sub_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 -; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s0, s12, s14 -; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_subb_u32 s1, s13, s15 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 
v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 -; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_addc_u32 s6, s7, s9 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: v_readfirstlane_b32 s7, v0 -; VI-NEXT: s_add_u32 s12, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_add_u32 s10, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 -; VI-NEXT: s_addc_u32 s13, 0, s8 -; VI-NEXT: s_mul_i32 s8, s4, s13 +; VI-NEXT: s_addc_u32 s11, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s11 ; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_add_i32 s8, s9, s8 -; VI-NEXT: s_mul_i32 s9, s5, s12 -; VI-NEXT: s_add_i32 s14, s8, s9 -; VI-NEXT: s_sub_i32 s10, s3, s14 +; VI-NEXT: s_mul_i32 s9, s5, s10 +; VI-NEXT: s_add_i32 s12, s8, s9 +; VI-NEXT: s_sub_i32 s13, s3, s12 ; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: s_sub_u32 s15, s2, s8 +; VI-NEXT: s_sub_u32 s14, s2, s8 ; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s16, s10, s5 -; VI-NEXT: s_sub_u32 s17, s15, s4 -; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 -; VI-NEXT: s_subb_u32 s10, s16, 0 -; VI-NEXT: s_cmp_ge_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s11, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s17, s4 +; VI-NEXT: s_subb_u32 s13, s13, s5 +; VI-NEXT: s_sub_u32 s15, s14, s4 +; VI-NEXT: s_subb_u32 s13, s13, 0 +; VI-NEXT: s_cmp_ge_u32 s13, s5 ; VI-NEXT: s_cselect_b32 s16, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s10, s5 -; VI-NEXT: s_cselect_b32 s10, s16, s11 -; VI-NEXT: s_add_u32 s11, s12, 1 -; VI-NEXT: s_addc_u32 s16, s13, 0 -; VI-NEXT: s_add_u32 s17, s12, 2 -; VI-NEXT: s_addc_u32 s18, s13, 0 -; VI-NEXT: s_cmp_lg_u32 s10, 0 -; VI-NEXT: s_cselect_b32 s10, s17, s11 -; VI-NEXT: s_cselect_b32 s11, s18, s16 +; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cselect_b32 s15, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s13, s5 +; VI-NEXT: s_cselect_b32 s13, s15, s16 +; VI-NEXT: s_add_u32 s15, s10, 1 +; VI-NEXT: s_addc_u32 s16, s11, 0 +; VI-NEXT: s_add_u32 s17, s10, 2 +; VI-NEXT: s_addc_u32 s18, s11, 0 +; VI-NEXT: s_cmp_lg_u32 s13, 0 +; VI-NEXT: s_cselect_b32 s13, s17, s15 +; VI-NEXT: s_cselect_b32 s15, s18, s16 ; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s3, s3, s14 +; VI-NEXT: s_subb_u32 s3, s3, s12 ; VI-NEXT: s_cmp_ge_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s8, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cmp_ge_u32 s14, s4 ; VI-NEXT: s_cselect_b32 s9, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s3, s9, s8 ; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_cselect_b32 s9, s11, s13 -; VI-NEXT: s_cselect_b32 s8, s10, s12 +; VI-NEXT: s_cselect_b32 s9, s15, s11 +; VI-NEXT: s_cselect_b32 s8, s13, s10 ; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: ; %bb.1: ; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s10, 0, s6 -; GFX9-NEXT: s_subb_u32 s11, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s9, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 -; GFX9-NEXT: s_mul_i32 s13, s11, s8 -; GFX9-NEXT: s_add_i32 s9, s14, s9 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: s_mul_i32 s15, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX9-NEXT: s_mul_i32 s13, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_add_i32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s15, s8, s11 +; GFX9-NEXT: s_mul_i32 s14, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 ; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s9, s12, s9 -; GFX9-NEXT: s_add_u32 s9, s13, s9 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s14, s8, s9 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s8, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 -; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s11, s11, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s14 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s12, s10 -; GFX9-NEXT: s_mul_i32 s16, s14, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 -; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_i32 s12, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s9, s9, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s10, s8 +; GFX9-NEXT: s_mul_i32 s16, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 -; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12 +; GFX9-NEXT: s_addc_u32 s8, s15, s13 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_mul_i32 s8, s12, s8 -; GFX9-NEXT: s_add_u32 s8, s10, s8 -; 
GFX9-NEXT: s_addc_u32 s10, 0, s9 -; GFX9-NEXT: s_add_u32 s11, s14, s8 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_addc_u32 s8, s12, s10 -; GFX9-NEXT: s_mul_i32 s10, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 -; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX9-NEXT: s_add_u32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 -; GFX9-NEXT: s_mul_i32 s11, s3, s11 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 -; GFX9-NEXT: s_addc_u32 s9, s9, s13 -; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s9 +; GFX9-NEXT: s_mul_i32 s11, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 +; GFX9-NEXT: s_add_u32 s11, s12, s11 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8 ; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s12, s9, s8 -; GFX9-NEXT: s_addc_u32 s13, 0, s10 -; GFX9-NEXT: s_mul_i32 s8, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 +; GFX9-NEXT: s_add_u32 s8, s11, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9 +; GFX9-NEXT: s_addc_u32 s8, s10, s13 +; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_mul_i32 s9, s3, s9 +; GFX9-NEXT: s_add_u32 s11, s8, s9 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s10 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11 ; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s9, s7, s12 -; GFX9-NEXT: s_add_i32 s14, s8, s9 -; GFX9-NEXT: s_sub_i32 s10, s3, s14 -; GFX9-NEXT: s_mul_i32 s8, s6, s12 -; GFX9-NEXT: s_sub_u32 s15, s2, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s11 +; GFX9-NEXT: s_add_i32 s12, s8, s9 +; GFX9-NEXT: s_sub_i32 s13, s3, s12 +; GFX9-NEXT: s_mul_i32 s8, s6, s11 +; GFX9-NEXT: s_sub_u32 s14, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s16, s10, s7 -; GFX9-NEXT: s_sub_u32 s17, s15, s6 -; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s16, 0 -; GFX9-NEXT: s_cmp_ge_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s11, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s17, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, s7 +; GFX9-NEXT: s_sub_u32 s15, s14, s6 +; GFX9-NEXT: s_subb_u32 s13, s13, 0 +; GFX9-NEXT: s_cmp_ge_u32 s13, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s10, s16, s11 -; GFX9-NEXT: s_add_u32 s11, s12, 1 -; GFX9-NEXT: s_addc_u32 s16, s13, 0 -; GFX9-NEXT: s_add_u32 s17, s12, 2 -; GFX9-NEXT: s_addc_u32 s18, s13, 0 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 -; GFX9-NEXT: s_cselect_b32 s10, s17, s11 -; GFX9-NEXT: s_cselect_b32 s11, s18, s16 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cselect_b32 s15, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, s7 +; GFX9-NEXT: s_cselect_b32 s13, s15, s16 +; GFX9-NEXT: s_add_u32 s15, s11, 1 +; GFX9-NEXT: s_addc_u32 s16, s10, 0 +; GFX9-NEXT: s_add_u32 s17, s11, 2 +; GFX9-NEXT: s_addc_u32 s18, s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b32 s13, s17, s15 +; GFX9-NEXT: s_cselect_b32 s15, s18, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s14 +; GFX9-NEXT: s_subb_u32 s3, s3, s12 ; GFX9-NEXT: s_cmp_ge_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s8, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cmp_ge_u32 s14, s6 ; GFX9-NEXT: s_cselect_b32 s9, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s3, s7 ; GFX9-NEXT: 
s_cselect_b32 s3, s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, s11, s13 -; GFX9-NEXT: s_cselect_b32 s8, s10, s12 +; GFX9-NEXT: s_cselect_b32 s9, s15, s10 +; GFX9-NEXT: s_cselect_b32 s8, s13, s11 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 ; GFX1010-NEXT: s_add_u32 s8, s8, s11 -; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1010-NEXT: s_mul_i32 s12, s9, s8 ; GFX1010-NEXT: s_mul_i32 s9, s9, s5 -; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1010-NEXT: s_add_i32 s9, s13, s9 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_add_i32 s9, s11, s9 +; GFX1010-NEXT: s_mul_i32 s11, s5, s12 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_i32 s10, s5, s11 +; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1010-NEXT: s_mul_i32 s15, s8, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1010-NEXT: s_add_u32 s12, s12, s15 +; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1010-NEXT: s_add_u32 s10, s12, s10 +; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s10, s11 ; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-NEXT: s_addc_u32 s11, s12, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 ; GFX1010-NEXT: s_add_u32 s8, s8, s9 -; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1010-NEXT: s_add_u32 s11, s11, s12 -; GFX1010-NEXT: s_addc_u32 s10, 0, s10 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_add_u32 s9, s9, s12 +; GFX1010-NEXT: s_addc_u32 s11, 0, s11 ; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s11, s8 +; GFX1010-NEXT: s_add_u32 s8, s9, s8 ; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s10, s9 +; GFX1010-NEXT: s_addc_u32 s8, s11, s10 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 ; GFX1010-NEXT: s_add_u32 s5, s8, s5 ; GFX1010-NEXT: s_addc_u32 s8, 0, s9 @@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, s7 ; GFX1010-NEXT: s_sub_u32 s13, s10, s6 -; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, 0 ; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 @@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 
%y) { ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 +; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 ; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 -; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 -; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 -; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 +; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 +; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 ; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 +; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 ; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 ; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 @@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 ; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 -; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 @@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 -; GFX1030W64-NEXT: s_subb_u32 s10, 
0, s5 +; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 -; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 -; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 -; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 -; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0 +; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s12, s10 +; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7 +; GFX1030W64-NEXT: s_add_i32 s10, s10, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10 ; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10 ; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 -; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 ; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 -; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 -; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 -; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 -; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 -; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s10 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W64-NEXT: s_add_i32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11 +; GFX1030W64-NEXT: s_add_i32 s8, s8, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11 +; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s14 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1030W64-NEXT: s_add_u32 
s6, s7, s6 -; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 -; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 -; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 -; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 -; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10 -; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 -; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 -; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 -; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 -; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8 +; GFX1030W64-NEXT: s_add_u32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12 +; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s8 +; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7 +; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7 ; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6 +; GFX1030W64-NEXT: s_add_u32 s7, s8, s7 +; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9 ; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s7, s6 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 ; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 ; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 ; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 ; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 -; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 +; GFX1030W64-NEXT: s_add_i32 s8, s6, s8 ; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 -; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 -; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 +; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8 +; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6 ; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 -; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 -; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5 +; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5 ; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 -; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1030W64-NEXT: s_add_u32 s13, s10, 1 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 ; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 ; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0 +; 
GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8 ; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 ; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 -; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 +; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 ; GFX11-NEXT: s_add_u32 s8, s8, s11 -; GFX11-NEXT: s_cselect_b32 s11, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_mul_i32 s11, s9, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX11-NEXT: s_mul_i32 s12, s9, s8 ; GFX11-NEXT: s_mul_i32 s9, s9, s7 -; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX11-NEXT: s_add_i32 s9, s13, s9 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_add_i32 s9, s11, s9 +; GFX11-NEXT: s_mul_i32 s11, s7, s12 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_i32 s10, s7, s11 +; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX11-NEXT: s_mul_i32 s15, s8, s9 ; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX11-NEXT: s_add_u32 s12, s12, s15 +; GFX11-NEXT: s_add_u32 s10, s10, s15 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 -; GFX11-NEXT: s_add_u32 s10, s12, s10 +; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s10, s11 ; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 -; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_addc_u32 s11, s12, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 ; GFX11-NEXT: s_add_u32 s8, s8, s9 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX11-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7 -; GFX11-NEXT: s_add_u32 s11, s11, s12 -; GFX11-NEXT: s_addc_u32 s10, 0, s10 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_add_u32 s9, s9, s12 +; GFX11-NEXT: s_addc_u32 s11, 0, s11 ; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s11, s8 +; GFX11-NEXT: s_add_u32 s8, s9, s8 ; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s10, s9 +; GFX11-NEXT: s_addc_u32 s8, s11, s10 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 ; GFX11-NEXT: s_add_u32 s7, s8, s7 ; GFX11-NEXT: s_addc_u32 s8, 0, s9 @@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s9, s9, s10 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 ; GFX11-NEXT: s_sub_u32 s10, s2, s10 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, s5 ; GFX11-NEXT: s_sub_u32 s13, s10, s4 -; GFX11-NEXT: s_cselect_b32 s14, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_ge_u32 s13, s4 @@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 -; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: ; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 @@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 ; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 @@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 -; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 -; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 -; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 ; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 -; GFX1250-NEXT: s_mul_i32 s11, s3, s8 +; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 +; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8 +; GFX1250-NEXT: s_mul_i32 s12, s3, s8 ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 -; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s12 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] @@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 ; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 -; GFX1250-NEXT: s_cselect_b32 s14, 
-1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4b151b9038a60..07e6a76d14cf9 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index cefcbddd3e394..fca57be5764f8 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index d8a5e7fa3b029..dbdea8e3c533d 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_add_u32 s7, s6, s6 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_or_b32 s4, s4, s5 -; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() { ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s7, s6, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s8, s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() { ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s5, s4, s4 -; GFX10-NEXT: s_cselect_b32 s6, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s6, s4, 0 ; GFX10-NEXT: s_cselect_b32 s7, -1, 0 ; GFX10-NEXT: s_and_b32 s7, s7, exec_lo @@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_addc_u32 s2, s0, 0 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s3, s3, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 ; GFX11-NEXT: s_cselect_b32 s0, s1, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-NEXT: s_add_u32 s0, s2, s2 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_addc_u32 s0, s2, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] @@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s2, s2 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s2, 0 +; GFX9-NEXT: s_add_u32 s1, s0, s0 +; GFX9-NEXT: s_addc_u32 s0, s0, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 @@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s1, s0, s0 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0 @@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 62847b15d3443..9a17538ea9b1b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI: ; %bb.0: ; SI-NEXT: s_and_b32 s3, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI: ; %bb.0: ; VI-NEXT: s_and_b32 s3, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s3, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s3, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 @@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s6, s4, 0xffe ; SI-NEXT: s_and_b32 s4, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s4, s0 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] @@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s5, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 @@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_lshr_b32 s5, s3, 8 -; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_and_b32 s5, s5, 0xffe +; VI-NEXT: s_and_b32 s6, s3, 0x1ff ; VI-NEXT: s_or_b32 s2, s6, s2 -; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 @@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 @@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s5, s3, 8 -; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_and_b32 s5, s5, 0xffe +; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX9-NEXT: s_or_b32 s2, s6, s2 -; 
GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014 @@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_and_b32 s6, s2, 0xffe ; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s2, s0 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; ; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s5, s2 -; GFX11-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-NEXT: s_or_b32 s2, s6, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-NEXT: s_cselect_b32 s2, s5, s6 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_or_b32 s0, s6, s0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index b0dd1872e2b3a..c28b25c76d241 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s7, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s6, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12 @@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ 
-937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s9, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9 ; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12 @@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 ; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe -; SI-GISEL-NEXT: s_or_b32 s6, s9, s6 ; SI-GISEL-NEXT: s_or_b32 s3, s4, s3 -; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; SI-GISEL-NEXT: s_or_b32 s4, s9, s6 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12 @@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; VI-GISEL-NEXT: s_or_b32 s4, s8, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s3, s3, s4 -; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 ; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_or_b32 s5, s5, s6 -; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s4, s4, s5 
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8 -; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5 -; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: 
s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX11-GISEL-FAKE16-NEXT: 
s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 -; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index f11654912d02d..b6b26a47970b0 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_and_b32 s1, s7, 0x1ff ; SI-NEXT: s_and_b32 s8, s0, 0xffe ; SI-NEXT: s_or_b32 s0, s1, s6 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 @@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe ; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff ; VI-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s1, s5 ; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX10-SDAG-NEXT: 
s_or_b32 s2, s5, s2 ; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 @@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 @@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe +; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2 ; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d15861be..31f277f73099b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -536,11 +535,10 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1824,7 +1817,6 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; 
GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3980,9 +3961,8 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; 
GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], 
s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 6351bb39e97f5..4581efc06504a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; 
%ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; 
GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2640,8 +2625,7 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; 
GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], 
v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac00863cd17..bd570d9eccdc3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 
.LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: v_readlane_b32 s3, v0, s2 +; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], 
v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; 
GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6311143f57260..1f2d70c931e73 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; 
GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 @@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6612,7 +6587,6 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 
s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: 
s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eee232a3f292e..c3f391786f878 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -136,19 +136,17 @@ define amdgpu_kernel void 
@f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: s_and_b32 s1, s8, s1 -; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: s_and_b32 s13, s8, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s13, s13, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: s_cselect_b32 s1, s19, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_and_b32 s20, s9, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 8748aff42d65b..6dc919988cc4f 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: .LBB28_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], 
s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 -; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 +; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: .LBB29_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX942-NEXT: v_readfirstlane_b32 s8, v1 -; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: v_readfirstlane_b32 s6, v1 ; GFX942-NEXT: s_mov_b32 m0, s3 +; GFX942-NEXT: v_readlane_b32 s8, v2, s3 +; GFX942-NEXT: v_writelane_b32 v0, s6, m0 +; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_writelane_b32 v0, s8, m0 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_lshl_b32 s1, 1, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_lshl_b32 s1, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: 
.LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 ; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 +; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s6, v1 ; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: v_readlane_b32 s8, v2, s3 +; GFX908-NEXT: v_writelane_b32 v0, s6, m0 +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v1 ; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v2, s3 +; GFX8-NEXT: v_writelane_b32 v0, s6, m0 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index c1cf06e30c745..fba42c494343b 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -388,9 +388,8 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit killed $scc - ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc + ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} @@ -416,6 +415,80 @@ body: | bb.2: S_ENDPGM 0 +... 
+--- +name: xor_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... +--- +name: absdiff_1_cmp_lg_0_killed_scc +body: | + ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc + S_NOP 0, implicit killed $scc + S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + ... 
--- @@ -2070,8 +2143,7 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index f53aaaad87e16..dd5f838b4a206 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s declare i32 @llvm.ctpop.i32(i32) declare i64 @llvm.ctpop.i64(i64) @@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: shl64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: lshr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: lshr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: ashr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: ashr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) { ; CHECK-LABEL: abs32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_abs_i32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: and32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; 
CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: and64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: or64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nand32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nand64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xnor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xnor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: andn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b32 s0, s0, s1 -; CHECK-NEXT: 
s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nandn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: orn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b32 s0, s0, s1 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: orn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { ; CHECK-LABEL: bfe_i32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { ; CHECK-LABEL: bfe_u32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { ; CHECK-LABEL: bcnt132: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { ; CHECK-LABEL: quadmask32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { ; CHECK-LABEL: quadmask64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) { ; CHECK-LABEL: not32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b32 s0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { ; CHECK-LABEL: not64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { %zext = zext i1 %cmp to i32 ret i32 %zext } + + +; -------------------------------------------------------------------------------- +; Negative tests +; -------------------------------------------------------------------------------- + +@1 = extern_weak dso_local addrspace(4) constant i32 + +define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() { +; CHECK-LABEL: 
si_pc_add_rel_offset_must_not_optimize: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_getpc_b64 s[0:1] +; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB35_2 +; CHECK-NEXT: ; %bb.1: ; %endif +; CHECK-NEXT: s_mov_b32 s0, 1 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_2: ; %if +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_branch .LBB35_3 +; CHECK-NEXT: .LBB35_3: + %cmp = icmp ne ptr addrspace(4) @1, null + br i1 %cmp, label %endif, label %if + +if: + ret i32 0 + +endif: + ret i32 1 +} diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll index a828ee0a7883c..7552f6b3a2d48 100644 --- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll +++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll @@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) { ; CHECK-LABEL: s_uaddo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_addc_u32 s0, 1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1) @@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: s_usubo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s0, s1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 5f6d6226dd17e..71f5a94a7f245 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s15, 0, s16 ; GCN-NEXT: s_add_u32 s16, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s15 ; GCN-NEXT: s_mul_i32 s0, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s15, s16, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s12 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 @@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 ; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s14, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: s_add_u32 s16, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s15, 0, s4 +; GCN-NEXT: s_addc_u32 s17, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s15 +; GCN-NEXT: s_mul_i32 s4, s10, s17 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s14 -; GCN-NEXT: s_add_i32 s16, s4, s5 -; GCN-NEXT: s_sub_i32 s17, s7, s16 -; GCN-NEXT: s_mul_i32 s4, s10, s14 +; GCN-NEXT: s_mul_i32 s5, s11, s16 +; 
GCN-NEXT: s_add_i32 s18, s4, s5 +; GCN-NEXT: s_sub_i32 s14, s7, s18 +; GCN-NEXT: s_mul_i32 s4, s10, s16 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s18, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s17, s17, s11 -; GCN-NEXT: s_sub_u32 s19, s6, s10 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s15, s4, s5 +; GCN-NEXT: s_subb_u32 s19, s14, s11 +; GCN-NEXT: s_sub_u32 s20, s6, s10 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_or_b32 s14, s14, s15 +; GCN-NEXT: s_subb_u32 s14, s19, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s11 +; GCN-NEXT: s_cselect_b32 s14, s19, s15 +; GCN-NEXT: s_add_u32 s15, s16, 1 +; GCN-NEXT: s_addc_u32 s19, s17, 0 +; GCN-NEXT: s_add_u32 s20, s16, 2 +; GCN-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s14, s20, s15 +; GCN-NEXT: s_cselect_b32 s15, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s17, 0 +; GCN-NEXT: s_subb_u32 s4, s7, s18 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s19, s10 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s11 -; GCN-NEXT: s_cselect_b32 s4, s17, s5 -; GCN-NEXT: s_add_u32 s5, s14, 1 -; GCN-NEXT: s_addc_u32 s17, s15, 0 -; GCN-NEXT: s_add_u32 s19, s14, 2 -; GCN-NEXT: s_addc_u32 s20, s15, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s19, s5 -; GCN-NEXT: s_cselect_b32 s5, s20, s17 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s16 -; GCN-NEXT: s_cmp_ge_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s11 -; GCN-NEXT: s_cselect_b32 s6, s6, s16 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s5, s5, s15 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_eq_u32 s4, s11 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s5, s15, s17 +; GCN-NEXT: s_cselect_b32 s4, s14, s16 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 @@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s12, 0, s13 ; GCN-NEXT: s_add_u32 s13, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s11, s11, s12 ; 
GCN-NEXT: s_mul_i32 s8, s2, s11 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s13, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s8, s11, s10 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 @@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mul_i32 s8, s7, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s11, s9, s8 -; GCN-NEXT: s_sub_i32 s12, 0, s11 -; GCN-NEXT: s_mul_i32 s8, s6, s10 -; GCN-NEXT: s_sub_u32 s13, 24, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s12, s12, s7 -; GCN-NEXT: s_sub_u32 s15, s13, s6 +; GCN-NEXT: s_add_i32 s13, s9, s8 +; GCN-NEXT: s_sub_i32 s10, 0, s13 +; GCN-NEXT: s_mul_i32 s8, s6, s12 +; GCN-NEXT: s_sub_u32 s14, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s11, s8, s9 +; GCN-NEXT: s_subb_u32 s15, s10, s7 +; GCN-NEXT: s_sub_u32 s16, s14, s6 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s11, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s7 +; GCN-NEXT: s_cselect_b32 s10, s15, s11 +; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_addc_u32 s17, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s16, s11 +; GCN-NEXT: s_cselect_b32 s11, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, 0, s13 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s6 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s7 -; GCN-NEXT: s_cselect_b32 s8, s12, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 -; GCN-NEXT: s_addc_u32 s12, 0, 0 -; GCN-NEXT: s_add_u32 s15, s10, 2 -; GCN-NEXT: s_addc_u32 s16, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s15, s9 -; GCN-NEXT: s_cselect_b32 s9, s16, s12 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s11, 0, s11 -; GCN-NEXT: s_cmp_ge_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s6 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s11, s7 -; GCN-NEXT: s_cselect_b32 s6, s6, s12 +; GCN-NEXT: s_cmp_eq_u32 s8, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, s10 +; GCN-NEXT: s_cselect_b32 s7, s11, 0 +; GCN-NEXT: s_cselect_b32 s6, s10, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: 
s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index bbd179364374c..e12e31b14e97d 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s12, 0, s9 +; GCN-NEXT: s_subb_u32 s10, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s11, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 -; GCN-NEXT: s_mul_i32 s14, s12, s10 -; GCN-NEXT: s_add_i32 s11, s15, s11 -; GCN-NEXT: s_add_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s16, s3, s10 -; GCN-NEXT: s_mul_i32 s15, s10, s11 -; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 -; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s11 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s12 +; GCN-NEXT: s_mul_i32 s14, s10, s12 +; GCN-NEXT: s_add_i32 s13, s15, s13 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s12 +; GCN-NEXT: s_mul_i32 s15, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s12, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s12, s13 ; GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 -; GCN-NEXT: s_mul_i32 s16, s13, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s11, s16 +; GCN-NEXT: s_mul_i32 s16, s11, s16 ; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s11, s13 ; GCN-NEXT: s_addc_u32 s14, s14, s18 ; GCN-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NEXT: s_mul_i32 s11, s13, s11 -; GCN-NEXT: s_add_u32 s11, s14, s11 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 ; GCN-NEXT: s_addc_u32 s14, 0, s15 -; GCN-NEXT: s_add_u32 s15, s10, s11 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s13, s13, s14 -; GCN-NEXT: s_mul_i32 s10, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 -; GCN-NEXT: s_add_i32 s10, s11, s10 -; GCN-NEXT: s_mul_i32 s12, s12, s15 -; GCN-NEXT: s_add_i32 s10, s10, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 -; GCN-NEXT: s_mul_i32 s14, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s15, s10 -; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 -; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s13, s3, s11 +; 
GCN-NEXT: s_mul_hi_u32 s14, s3, s12 +; GCN-NEXT: s_add_i32 s13, s14, s13 +; GCN-NEXT: s_mul_i32 s10, s10, s12 +; GCN-NEXT: s_add_i32 s13, s13, s10 +; GCN-NEXT: s_mul_i32 s3, s3, s12 +; GCN-NEXT: s_mul_hi_u32 s14, s11, s3 +; GCN-NEXT: s_mul_i32 s15, s11, s3 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s3, s12, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s3, s3, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 -; GCN-NEXT: s_addc_u32 s3, s16, s12 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s10, s13, s10 -; GCN-NEXT: s_add_u32 s3, s3, s10 -; GCN-NEXT: s_addc_u32 s12, 0, s11 -; GCN-NEXT: s_add_u32 s3, s15, s3 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GCN-NEXT: s_addc_u32 s14, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s10, s11, s13 +; GCN-NEXT: s_addc_u32 s3, s16, s14 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_add_u32 s3, s3, s13 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_add_u32 s3, s12, s3 +; GCN-NEXT: s_addc_u32 s14, s11, s10 ; GCN-NEXT: s_ashr_i32 s10, s5, 31 ; GCN-NEXT: s_add_u32 s12, s4, s10 ; GCN-NEXT: s_mov_b32 s11, s10 @@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s3, s8, s3 ; GCN-NEXT: s_sub_u32 s3, s12, s3 ; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s12, s16, s9 ; GCN-NEXT: s_sub_u32 s18, s3, s8 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s19, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s19, s9 ; GCN-NEXT: s_cselect_b32 s20, -1, 0 @@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b32 s20, s21, s20 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, s9 -; GCN-NEXT: s_sub_u32 s21, s18, s8 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_sub_u32 s16, s18, s8 ; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s20, 0 -; GCN-NEXT: s_cselect_b32 s16, s21, s18 +; GCN-NEXT: s_cselect_b32 s16, s16, s18 ; GCN-NEXT: s_cselect_b32 s12, s12, s19 ; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s5, s13, s5 @@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s3, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s3, s3, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s3, s3, s19 ; TONGA-NEXT: 
s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s5, s13, s5 @@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s9, 0, s6 -; GCN-NEXT: s_subb_u32 s16, 0, s7 +; GCN-NEXT: s_subb_u32 s14, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 -; GCN-NEXT: s_mul_i32 s18, s16, s14 -; GCN-NEXT: s_add_i32 s15, s19, s15 -; GCN-NEXT: s_add_i32 s15, s15, s18 -; GCN-NEXT: s_mul_i32 s20, s9, s14 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s16 +; GCN-NEXT: s_mul_i32 s18, s14, s16 +; GCN-NEXT: s_add_i32 s17, s19, s17 +; GCN-NEXT: s_add_i32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s16 +; GCN-NEXT: s_mul_i32 s19, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s16, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s16, s17 ; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 -; GCN-NEXT: s_mul_i32 s20, s17, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s15, s20 +; GCN-NEXT: s_mul_i32 s20, s15, s20 ; GCN-NEXT: s_add_u32 s19, s19, s20 -; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s15, s17 ; GCN-NEXT: s_addc_u32 s18, s18, s22 ; GCN-NEXT: s_addc_u32 s19, s21, 0 -; GCN-NEXT: s_mul_i32 s15, s17, s15 -; GCN-NEXT: s_add_u32 s15, s18, s15 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s17, s18, s17 ; GCN-NEXT: s_addc_u32 s18, 0, s19 -; GCN-NEXT: s_add_u32 s19, s14, s15 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s17, s17, s18 -; GCN-NEXT: s_mul_i32 s14, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 -; GCN-NEXT: s_add_i32 s14, s15, s14 -; GCN-NEXT: s_mul_i32 s16, s16, s19 -; GCN-NEXT: s_add_i32 s14, s14, s16 -; GCN-NEXT: s_mul_i32 s9, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 -; GCN-NEXT: s_mul_i32 s18, s17, s9 -; GCN-NEXT: s_mul_i32 s21, s19, s14 -; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 -; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 +; GCN-NEXT: s_add_u32 s16, s16, s17 +; GCN-NEXT: s_addc_u32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s17, s9, s15 +; GCN-NEXT: s_mul_hi_u32 s18, s9, s16 +; GCN-NEXT: s_add_i32 s17, s18, s17 +; GCN-NEXT: s_mul_i32 s14, s14, s16 +; GCN-NEXT: s_add_i32 s17, s17, s14 +; GCN-NEXT: s_mul_i32 s9, s9, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s15, s9 +; GCN-NEXT: s_mul_i32 s19, s15, s9 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s9, s16, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 ; GCN-NEXT: s_add_u32 s9, s9, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s9, s9, s18 -; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 -; GCN-NEXT: s_addc_u32 s9, s20, s16 -; GCN-NEXT: s_addc_u32 s15, s15, 0 -; GCN-NEXT: s_mul_i32 s14, s17, s14 -; GCN-NEXT: s_add_u32 s9, s9, s14 -; GCN-NEXT: s_addc_u32 s16, 0, s15 -; GCN-NEXT: s_add_u32 s9, s19, s9 -; 
GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GCN-NEXT: s_addc_u32 s18, s17, s16 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s17 +; GCN-NEXT: s_addc_u32 s9, s20, s18 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s17, s15, s17 +; GCN-NEXT: s_add_u32 s9, s9, s17 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_add_u32 s9, s16, s9 +; GCN-NEXT: s_addc_u32 s18, s15, s14 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: s_add_u32 s16, s10, s14 ; GCN-NEXT: s_mov_b32 s15, s14 @@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s6, s9 ; GCN-NEXT: s_sub_u32 s9, s16, s9 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s16, s20, s7 ; GCN-NEXT: s_sub_u32 s22, s9, s6 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s23, s16, 0 ; GCN-NEXT: s_cmp_ge_u32 s23, s7 ; GCN-NEXT: s_cselect_b32 s24, -1, 0 @@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s24, s25, s24 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, s7 -; GCN-NEXT: s_sub_u32 s25, s22, s6 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_sub_u32 s20, s22, s6 ; GCN-NEXT: s_subb_u32 s16, s16, 0 ; GCN-NEXT: s_cmp_lg_u32 s24, 0 -; GCN-NEXT: s_cselect_b32 s20, s25, s22 +; GCN-NEXT: s_cselect_b32 s20, s20, s22 ; GCN-NEXT: s_cselect_b32 s16, s16, s23 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s11, s17, s11 @@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: 
s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; 
TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 @@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s17, 0, s6 -; GCN-NEXT: s_subb_u32 s24, 0, s7 +; GCN-NEXT: s_subb_u32 s22, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s25, v1 -; GCN-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NEXT: s_mul_i32 s23, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 -; GCN-NEXT: s_mul_i32 s26, s24, s22 -; GCN-NEXT: s_add_i32 s23, s27, s23 -; GCN-NEXT: s_add_i32 s23, s23, s26 -; GCN-NEXT: s_mul_i32 s28, s17, s22 -; GCN-NEXT: s_mul_i32 s27, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 -; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s24, v0 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s24 +; GCN-NEXT: s_mul_i32 s26, s22, s24 +; GCN-NEXT: s_add_i32 s25, s27, s25 +; GCN-NEXT: s_add_i32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s24 +; GCN-NEXT: s_mul_i32 s27, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s29, s24, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s24, s25 ; GCN-NEXT: s_add_u32 s27, s29, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 -; GCN-NEXT: s_mul_i32 s28, s25, s28 +; GCN-NEXT: s_mul_hi_u32 s30, s23, s28 +; GCN-NEXT: s_mul_i32 s28, s23, s28 ; GCN-NEXT: s_add_u32 s27, s27, s28 -; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s23, s25 ; GCN-NEXT: s_addc_u32 s26, s26, s30 ; GCN-NEXT: s_addc_u32 s27, s29, 0 -; GCN-NEXT: s_mul_i32 s23, s25, s23 -; GCN-NEXT: s_add_u32 s23, s26, s23 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s25, s26, s25 ; GCN-NEXT: s_addc_u32 s26, 0, s27 -; GCN-NEXT: s_add_u32 s27, s22, s23 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s25, s25, s26 -; GCN-NEXT: s_mul_i32 s22, s17, s25 -; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 -; GCN-NEXT: s_add_i32 s22, s23, s22 -; GCN-NEXT: s_mul_i32 s24, s24, s27 -; GCN-NEXT: s_add_i32 s22, s22, s24 -; GCN-NEXT: s_mul_i32 s17, s17, s27 -; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 -; GCN-NEXT: s_mul_i32 s26, s25, s17 -; GCN-NEXT: s_mul_i32 s29, s27, s22 -; GCN-NEXT: s_mul_hi_u32 
s17, s27, s17 -; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 +; GCN-NEXT: s_add_u32 s24, s24, s25 +; GCN-NEXT: s_addc_u32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s25, s17, s23 +; GCN-NEXT: s_mul_hi_u32 s26, s17, s24 +; GCN-NEXT: s_add_i32 s25, s26, s25 +; GCN-NEXT: s_mul_i32 s22, s22, s24 +; GCN-NEXT: s_add_i32 s25, s25, s22 +; GCN-NEXT: s_mul_i32 s17, s17, s24 +; GCN-NEXT: s_mul_hi_u32 s26, s23, s17 +; GCN-NEXT: s_mul_i32 s27, s23, s17 +; GCN-NEXT: s_mul_i32 s29, s24, s25 +; GCN-NEXT: s_mul_hi_u32 s17, s24, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s24, s25 ; GCN-NEXT: s_add_u32 s17, s17, s29 ; GCN-NEXT: s_addc_u32 s28, 0, s28 -; GCN-NEXT: s_add_u32 s17, s17, s26 -; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 -; GCN-NEXT: s_addc_u32 s17, s28, s24 -; GCN-NEXT: s_addc_u32 s23, s23, 0 -; GCN-NEXT: s_mul_i32 s22, s25, s22 -; GCN-NEXT: s_add_u32 s17, s17, s22 -; GCN-NEXT: s_addc_u32 s24, 0, s23 -; GCN-NEXT: s_add_u32 s17, s27, s17 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 -; GCN-NEXT: s_addc_u32 s26, s25, s24 +; GCN-NEXT: s_add_u32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s25 +; GCN-NEXT: s_addc_u32 s17, s28, s26 +; GCN-NEXT: s_addc_u32 s22, s22, 0 +; GCN-NEXT: s_mul_i32 s25, s23, s25 +; GCN-NEXT: s_add_u32 s17, s17, s25 +; GCN-NEXT: s_addc_u32 s22, 0, s22 +; GCN-NEXT: s_add_u32 s17, s24, s17 +; GCN-NEXT: s_addc_u32 s26, s23, s22 ; GCN-NEXT: s_ashr_i32 s22, s19, 31 ; GCN-NEXT: s_add_u32 s24, s18, s22 ; GCN-NEXT: s_mov_b32 s23, s22 @@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s17, s6, s17 ; GCN-NEXT: s_sub_u32 s17, s24, s17 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s24, s28, s7 ; GCN-NEXT: s_sub_u32 s30, s17, s6 ; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s31, s24, 0 ; GCN-NEXT: s_cmp_ge_u32 s31, s7 ; GCN-NEXT: s_cselect_b32 s33, -1, 0 @@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s33, s34, s33 ; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, s7 -; GCN-NEXT: s_sub_u32 s34, s30, s6 -; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 +; GCN-NEXT: s_sub_u32 s28, s30, s6 ; GCN-NEXT: s_subb_u32 s24, s24, 0 ; GCN-NEXT: s_cmp_lg_u32 s33, 0 -; GCN-NEXT: s_cselect_b32 s28, s34, s30 +; GCN-NEXT: s_cselect_b32 s28, s28, s30 ; GCN-NEXT: s_cselect_b32 s24, s24, s31 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s19, s25, s19 @@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 ; GCN-NEXT: s_sub_u32 s13, 0, s18 -; GCN-NEXT: s_subb_u32 s22, 0, s19 +; GCN-NEXT: s_subb_u32 s20, 0, s19 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s23, v1 -; GCN-NEXT: v_readfirstlane_b32 s20, v0 -; GCN-NEXT: s_mul_i32 s21, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 -; GCN-NEXT: s_mul_i32 s24, s22, s20 -; GCN-NEXT: s_add_i32 s21, s25, s21 -; GCN-NEXT: s_add_i32 s21, s21, s24 -; GCN-NEXT: s_mul_i32 s26, 
s13, s20 -; GCN-NEXT: s_mul_i32 s25, s20, s21 -; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 -; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 +; GCN-NEXT: v_readfirstlane_b32 s21, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s22 +; GCN-NEXT: s_mul_i32 s24, s20, s22 +; GCN-NEXT: s_add_i32 s23, s25, s23 +; GCN-NEXT: s_add_i32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s22 +; GCN-NEXT: s_mul_i32 s25, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s22, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s22, s23 ; GCN-NEXT: s_add_u32 s25, s27, s25 ; GCN-NEXT: s_addc_u32 s24, 0, s24 -; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 -; GCN-NEXT: s_mul_i32 s26, s23, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s21, s26 +; GCN-NEXT: s_mul_i32 s26, s21, s26 ; GCN-NEXT: s_add_u32 s25, s25, s26 -; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s21, s23 ; GCN-NEXT: s_addc_u32 s24, s24, s28 ; GCN-NEXT: s_addc_u32 s25, s27, 0 -; GCN-NEXT: s_mul_i32 s21, s23, s21 -; GCN-NEXT: s_add_u32 s21, s24, s21 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s23, s24, s23 ; GCN-NEXT: s_addc_u32 s24, 0, s25 -; GCN-NEXT: s_add_u32 s25, s20, s21 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s23, s23, s24 -; GCN-NEXT: s_mul_i32 s20, s13, s23 -; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 -; GCN-NEXT: s_add_i32 s20, s21, s20 -; GCN-NEXT: s_mul_i32 s22, s22, s25 -; GCN-NEXT: s_add_i32 s20, s20, s22 -; GCN-NEXT: s_mul_i32 s13, s13, s25 -; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 -; GCN-NEXT: s_mul_i32 s24, s23, s13 -; GCN-NEXT: s_mul_i32 s27, s25, s20 -; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 -; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 +; GCN-NEXT: s_add_u32 s22, s22, s23 +; GCN-NEXT: s_addc_u32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s23, s13, s21 +; GCN-NEXT: s_mul_hi_u32 s24, s13, s22 +; GCN-NEXT: s_add_i32 s23, s24, s23 +; GCN-NEXT: s_mul_i32 s20, s20, s22 +; GCN-NEXT: s_add_i32 s23, s23, s20 +; GCN-NEXT: s_mul_i32 s13, s13, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s21, s13 +; GCN-NEXT: s_mul_i32 s25, s21, s13 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s13, s22, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 ; GCN-NEXT: s_add_u32 s13, s13, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_add_u32 s13, s13, s24 -; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 -; GCN-NEXT: s_addc_u32 s13, s26, s22 -; GCN-NEXT: s_addc_u32 s21, s21, 0 -; GCN-NEXT: s_mul_i32 s20, s23, s20 -; GCN-NEXT: s_add_u32 s13, s13, s20 -; GCN-NEXT: s_addc_u32 s22, 0, s21 -; GCN-NEXT: s_add_u32 s13, s25, s13 -; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 -; GCN-NEXT: s_addc_u32 s24, s23, s22 +; GCN-NEXT: s_add_u32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s20, s21, s23 +; GCN-NEXT: s_addc_u32 s13, s26, s24 +; GCN-NEXT: s_addc_u32 s20, s20, 0 +; GCN-NEXT: s_mul_i32 s23, s21, s23 +; GCN-NEXT: s_add_u32 s13, s13, s23 +; GCN-NEXT: s_addc_u32 s20, 0, s20 +; GCN-NEXT: s_add_u32 s13, s22, s13 +; GCN-NEXT: s_addc_u32 s24, s21, s20 ; GCN-NEXT: s_ashr_i32 s20, s15, 31 ; GCN-NEXT: s_add_u32 s22, s14, s20 ; GCN-NEXT: s_mov_b32 s21, s20 @@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s13, s18, s13 ; GCN-NEXT: s_sub_u32 s13, s22, s13 ; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s22, s26, s19 ; GCN-NEXT: s_sub_u32 s28, s13, s18 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; 
GCN-NEXT: s_subb_u32 s29, s22, 0 ; GCN-NEXT: s_cmp_ge_u32 s29, s19 ; GCN-NEXT: s_cselect_b32 s30, -1, 0 @@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s30, s31, s30 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, s19 -; GCN-NEXT: s_sub_u32 s31, s28, s18 -; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_sub_u32 s26, s28, s18 ; GCN-NEXT: s_subb_u32 s22, s22, 0 ; GCN-NEXT: s_cmp_lg_u32 s30, 0 -; GCN-NEXT: s_cselect_b32 s26, s31, s28 +; GCN-NEXT: s_cselect_b32 s26, s26, s28 ; GCN-NEXT: s_cselect_b32 s22, s22, s29 ; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s15, s23, s15 @@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GCN-NEXT: s_sub_u32 s9, 0, s14 -; GCN-NEXT: s_subb_u32 s18, 0, s15 +; GCN-NEXT: s_subb_u32 s16, 0, s15 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s19, v1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_mul_i32 s17, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 -; GCN-NEXT: s_mul_i32 s20, s18, s16 -; GCN-NEXT: s_add_i32 s17, s21, s17 -; GCN-NEXT: s_add_i32 s17, s17, s20 -; GCN-NEXT: s_mul_i32 s22, s9, s16 -; GCN-NEXT: s_mul_i32 s21, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 -; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s18, v0 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s18 +; GCN-NEXT: s_mul_i32 s20, s16, s18 +; GCN-NEXT: s_add_i32 s19, s21, s19 +; GCN-NEXT: s_add_i32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s18 +; GCN-NEXT: s_mul_i32 s21, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s23, s18, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s18, s19 ; GCN-NEXT: s_add_u32 s21, s23, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 -; GCN-NEXT: s_mul_i32 s22, s19, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s17, s22 +; GCN-NEXT: s_mul_i32 s22, s17, s22 ; GCN-NEXT: s_add_u32 s21, s21, s22 -; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s19 ; GCN-NEXT: s_addc_u32 s20, s20, s24 ; GCN-NEXT: s_addc_u32 s21, s23, 0 -; GCN-NEXT: s_mul_i32 s17, s19, s17 -; GCN-NEXT: s_add_u32 s17, s20, s17 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s19, s20, s19 ; GCN-NEXT: s_addc_u32 s20, 0, s21 -; GCN-NEXT: s_add_u32 s21, s16, s17 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s19, s19, s20 -; GCN-NEXT: s_mul_i32 s16, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 -; GCN-NEXT: s_add_i32 s16, s17, s16 -; GCN-NEXT: s_mul_i32 s18, s18, s21 -; GCN-NEXT: s_add_i32 s16, s16, s18 -; GCN-NEXT: s_mul_i32 s9, s9, s21 -; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 -; GCN-NEXT: s_mul_i32 s20, s19, s9 -; GCN-NEXT: s_mul_i32 s23, s21, s16 -; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 -; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 +; GCN-NEXT: s_add_u32 s18, s18, s19 +; GCN-NEXT: s_addc_u32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s19, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s20, s9, s18 +; GCN-NEXT: s_add_i32 
s19, s20, s19 +; GCN-NEXT: s_mul_i32 s16, s16, s18 +; GCN-NEXT: s_add_i32 s19, s19, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s17, s9 +; GCN-NEXT: s_mul_i32 s23, s18, s19 +; GCN-NEXT: s_mul_hi_u32 s9, s18, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s18, s19 ; GCN-NEXT: s_add_u32 s9, s9, s23 ; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s9, s9, s20 -; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 -; GCN-NEXT: s_addc_u32 s9, s22, s18 -; GCN-NEXT: s_addc_u32 s17, s17, 0 -; GCN-NEXT: s_mul_i32 s16, s19, s16 -; GCN-NEXT: s_add_u32 s9, s9, s16 -; GCN-NEXT: s_addc_u32 s18, 0, s17 -; GCN-NEXT: s_add_u32 s9, s21, s9 -; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_addc_u32 s20, s19, s18 +; GCN-NEXT: s_add_u32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s19 +; GCN-NEXT: s_addc_u32 s9, s22, s20 +; GCN-NEXT: s_addc_u32 s16, s16, 0 +; GCN-NEXT: s_mul_i32 s19, s17, s19 +; GCN-NEXT: s_add_u32 s9, s9, s19 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: s_add_u32 s9, s18, s9 +; GCN-NEXT: s_addc_u32 s20, s17, s16 ; GCN-NEXT: s_ashr_i32 s16, s11, 31 ; GCN-NEXT: s_add_u32 s18, s10, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s14, s9 ; GCN-NEXT: s_sub_u32 s9, s18, s9 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s18, s22, s15 ; GCN-NEXT: s_sub_u32 s24, s9, s14 ; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s25, s18, 0 ; GCN-NEXT: s_cmp_ge_u32 s25, s15 ; GCN-NEXT: s_cselect_b32 s26, -1, 0 @@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s26, s27, s26 ; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, s15 -; GCN-NEXT: s_sub_u32 s27, s24, s14 -; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_sub_u32 s22, s24, s14 ; GCN-NEXT: s_subb_u32 s18, s18, 0 ; GCN-NEXT: s_cmp_lg_u32 s26, 0 -; GCN-NEXT: s_cselect_b32 s22, s27, s24 +; GCN-NEXT: s_cselect_b32 s22, s22, s24 ; GCN-NEXT: s_cselect_b32 s18, s18, s25 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s11, s19, s11 @@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: s_subb_u32 s12, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 -; GCN-NEXT: s_mul_i32 s16, s14, s12 -; GCN-NEXT: s_add_i32 s13, s17, s13 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s12 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; 
GCN-NEXT: s_mul_hi_u32 s17, s3, s14 +; GCN-NEXT: s_mul_i32 s16, s12, s14 +; GCN-NEXT: s_add_i32 s15, s17, s15 +; GCN-NEXT: s_add_i32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s14 +; GCN-NEXT: s_mul_i32 s17, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 -; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 +; GCN-NEXT: s_mul_i32 s18, s13, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s13, s15, s13 -; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s15, s16, s15 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s17, s12, s13 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s12, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 -; GCN-NEXT: s_add_i32 s12, s13, s12 -; GCN-NEXT: s_mul_i32 s14, s14, s17 -; GCN-NEXT: s_add_i32 s12, s12, s14 -; GCN-NEXT: s_mul_i32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 -; GCN-NEXT: s_mul_i32 s16, s15, s3 -; GCN-NEXT: s_mul_i32 s19, s17, s12 -; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s15, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 +; GCN-NEXT: s_add_i32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s12, s12, s14 +; GCN-NEXT: s_add_i32 s15, s15, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 -; GCN-NEXT: s_addc_u32 s3, s18, s14 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s12, s15, s12 -; GCN-NEXT: s_add_u32 s3, s3, s12 -; GCN-NEXT: s_addc_u32 s14, 0, s13 -; GCN-NEXT: s_add_u32 s3, s17, s3 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 +; GCN-NEXT: s_addc_u32 s3, s18, s16 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s15, s13, s15 +; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s3, s14, s3 +; GCN-NEXT: s_addc_u32 s16, s13, s12 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; 
GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s23, s20, s10 -; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_sub_u32 s18, s20, s10 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s18, s18, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v8 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_sub_u32 s16, s18, s6 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s16, s16, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 33b0a5d129696..ea9bb0417dfa4 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; 
GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s8, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_add_u32 s11, s14, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s10, s12, 
s10 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 @@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 -; GCN-NEXT: s_add_i32 s12, s11, s12 -; GCN-NEXT: s_sub_i32 s13, s7, s12 +; GCN-NEXT: s_add_i32 s14, s11, s12 +; GCN-NEXT: s_sub_i32 s12, s7, s14 ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s14, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s15, s6, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s16, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s4 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s16, s5 -; GCN-NEXT: s_cselect_b32 s17, s17, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s13, s13, s5 -; GCN-NEXT: s_sub_u32 s18, s15, s4 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s13, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s6, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s17, s12, s13 +; GCN-NEXT: s_subb_u32 s17, s15, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s4 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s5 +; GCN-NEXT: s_cselect_b32 s18, s19, s18 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s15, s15, s5 +; GCN-NEXT: s_sub_u32 s19, s16, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_or_b32 s12, s12, s13 +; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s11, s18, s15 -; GCN-NEXT: s_cselect_b32 s10, s10, s16 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s12 +; GCN-NEXT: s_subb_u32 s7, s7, s14 ; GCN-NEXT: s_cmp_ge_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s12 +; GCN-NEXT: s_cselect_b32 s4, s4, s10 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s10, s7 -; GCN-NEXT: s_cselect_b32 s4, s11, s6 +; GCN-NEXT: s_cselect_b32 s5, s12, s7 +; GCN-NEXT: s_cselect_b32 s4, s13, s6 ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: s_sub_u32 s4, s4, s8 ; GCN-NEXT: s_subb_u32 s5, s5, s8 @@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 -; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: 
s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s6, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s6, s2, s9 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 @@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s11, s2 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s6, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 @@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_add_i32 s8, s8, s7 -; GCN-NEXT: s_sub_i32 s9, 0, s8 -; GCN-NEXT: s_sub_u32 s10, 24, s6 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s11, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s12, s10, s4 +; GCN-NEXT: s_add_i32 s10, s8, s7 +; GCN-NEXT: s_sub_i32 s8, 0, s10 +; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s9, s6, s7 +; GCN-NEXT: s_subb_u32 s12, s8, s5 +; GCN-NEXT: s_sub_u32 s13, s11, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s4 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s5 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s5 +; GCN-NEXT: s_sub_u32 s16, s13, s4 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s13, s9, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s5 +; GCN-NEXT: s_subb_u32 s6, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s6, s5 ; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s4 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s5 -; GCN-NEXT: s_cselect_b32 s14, s14, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s9, s9, s5 -; GCN-NEXT: s_sub_u32 s15, s12, s4 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_subb_u32 s6, s9, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s7, s15, s12 -; GCN-NEXT: s_cselect_b32 s6, s6, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s8, 0, s8 -; GCN-NEXT: s_cmp_ge_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s4 +; GCN-NEXT: s_cmp_ge_u32 s11, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s9 +; GCN-NEXT: s_cmp_eq_u32 s6, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s6, s8 -; GCN-NEXT: 
s_cselect_b32 s5, s7, s10 +; GCN-NEXT: s_cselect_b32 s4, s8, s6 +; GCN-NEXT: s_cselect_b32 s5, s9, s11 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s9, s10, s11 -; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bb5918b256d28..bdd22f25e91c8 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_addc_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_addc_u32 s3, s3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s6, s2, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s4, s3, s7 +; GFX9-NEXT: s_add_u32 s4, s2, s6 +; GFX9-NEXT: s_addc_u32 s5, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | 
instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_add_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll 
b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 41199b0268ae4..fd461ac80ea55 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_addc_u32 s10, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s8 +; GCN-NEXT: s_mul_i32 s0, s3, s10 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 -; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s11, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s12, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_add_i32 s11, s1, s0 +; GCN-NEXT: s_sub_i32 s8, 0, s11 +; GCN-NEXT: s_mul_i32 s0, s2, s10 +; GCN-NEXT: s_sub_u32 s12, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s9, s0, s1 +; GCN-NEXT: s_subb_u32 s13, s8, s3 +; GCN-NEXT: s_sub_u32 s14, s12, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_cselect_b32 s13, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, s3 +; GCN-NEXT: s_cselect_b32 s8, s13, s9 +; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_addc_u32 s13, 0, 0 +; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_addc_u32 s15, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s8, s14, s9 +; GCN-NEXT: s_cselect_b32 s9, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: 
s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_subb_u32 s0, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s2 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s0, s10, s1 -; GCN-NEXT: s_add_u32 s1, s8, 1 -; GCN-NEXT: s_addc_u32 s10, 0, 0 -; GCN-NEXT: s_add_u32 s13, s8, 2 -; GCN-NEXT: s_addc_u32 s14, 0, 0 +; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s13, s1 -; GCN-NEXT: s_cselect_b32 s1, s14, s10 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s2 -; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s8 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_cselect_b32 s0, s9, 0 +; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_or_b32 s12, s12, s13 -; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index cdcc9146cc5ae..137dc1fe42294 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -85,7 +84,6 @@ define amdgpu_kernel void 
@s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s10, s5, s10 -; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_add_i32 s12, s5, s10 +; GCN-NEXT: s_sub_i32 s10, s7, s12 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s12, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s13, s6, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s11, s4, s5 +; GCN-NEXT: s_subb_u32 s13, s10, s9 +; GCN-NEXT: s_sub_u32 s14, s6, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s15, s10, s11 +; GCN-NEXT: s_subb_u32 s15, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s8 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s9 +; GCN-NEXT: s_cselect_b32 s16, s17, s16 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s13, s13, s9 +; GCN-NEXT: s_sub_u32 s17, s14, s8 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s14, s11, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_subb_u32 s4, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s4, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s8 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s9 -; GCN-NEXT: s_cselect_b32 s15, s15, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s11, s11, s9 -; GCN-NEXT: s_sub_u32 s16, s13, s8 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_subb_u32 s4, s11, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s5, s16, s13 -; GCN-NEXT: s_cselect_b32 s4, s4, s14 -; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_subb_u32 s7, s7, s10 -; GCN-NEXT: s_cmp_ge_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s8, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s7, s9 -; GCN-NEXT: s_cselect_b32 s8, s8, s10 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s9 +; GCN-NEXT: s_cselect_b32 s5, s7, s5 +; GCN-NEXT: s_cmp_lg_u32 s5, 0 +; GCN-NEXT: s_cselect_b32 s4, s10, s4 +; GCN-NEXT: s_cselect_b32 s5, s11, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 -; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 
63, s12 @@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 -; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s9, s1, s0 -; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_add_i32 s10, s1, s0 +; GCN-NEXT: s_sub_i32 s9, 0, s10 ; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s8, 24, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s11, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s12, s8, s2 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s8, s0, s1 +; GCN-NEXT: s_subb_u32 s12, s9, s3 +; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_subb_u32 s14, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s2 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s3 +; GCN-NEXT: s_cselect_b32 s15, s16, s15 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s12, s12, s3 +; GCN-NEXT: s_sub_u32 s16, s13, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s13, s10, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s3 +; GCN-NEXT: s_subb_u32 s0, 0, s10 +; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 -; GCN-NEXT: s_cselect_b32 s14, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s13, s3 -; GCN-NEXT: s_cselect_b32 s14, s14, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s10, s10, s3 -; GCN-NEXT: s_sub_u32 s15, s12, s2 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_subb_u32 s0, s10, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s1, s15, s12 -; GCN-NEXT: s_cselect_b32 s0, s0, s13 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_subb_u32 s9, 0, s9 -; GCN-NEXT: s_cmp_ge_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 -; GCN-NEXT: 
s_cmp_ge_u32 s8, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s9, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s9 -; GCN-NEXT: s_cselect_b32 s1, s1, s8 +; GCN-NEXT: s_cmp_eq_u32 s0, s3 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s8, s0 +; GCN-NEXT: s_cselect_b32 s1, s9, s11 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 -; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 -; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_or_b32 s14, s14, s15 -; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index d67a7b151948e..e8db6471b6a46 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_subb_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_subb_u32 s3, s3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s6, s2, s6 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_subb_u32 s4, s3, s7 +; GFX9-NEXT: s_sub_u32 s4, s2, s6 +; GFX9-NEXT: s_subb_u32 s5, s3, s7 ; GFX9-NEXT: 
s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_subb_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_sub_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 -; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s0, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s12, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -503,10 +489,8 @@ 
define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s0, s12, s14 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 75db3879e7b03..28c6b40554bb6 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 ; GFX1032-NEXT: s_add_u32 s8, s8, s11 -; GFX1032-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s12 -; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1032-NEXT: s_mul_i32 s12, s9, s8 ; GFX1032-NEXT: s_mul_i32 s9, s9, s5 -; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11 -; GFX1032-NEXT: s_add_i32 s9, s13, s9 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_add_i32 s9, s11, s9 +; GFX1032-NEXT: s_mul_i32 s11, s5, s12 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_i32 s10, s5, s11 +; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX1032-NEXT: s_mul_i32 s15, s8, s9 ; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1032-NEXT: s_add_u32 s12, s12, s15 +; GFX1032-NEXT: s_add_u32 s10, s10, s15 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9 -; GFX1032-NEXT: s_add_u32 s10, s12, s10 +; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1032-NEXT: s_add_u32 s10, s10, s11 ; GFX1032-NEXT: s_mul_i32 s9, s5, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 -; GFX1032-NEXT: s_addc_u32 s11, s11, 0 +; GFX1032-NEXT: s_addc_u32 s11, s12, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 ; GFX1032-NEXT: s_add_u32 s8, s8, s9 -; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8 -; GFX1032-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s10 -; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 ; GFX1032-NEXT: s_mul_i32 s12, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5 -; GFX1032-NEXT: s_add_u32 s11, s11, s12 -; GFX1032-NEXT: s_addc_u32 s10, 0, s10 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_add_u32 s9, s9, s12 +; GFX1032-NEXT: s_addc_u32 s11, 0, s11 ; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1032-NEXT: 
s_add_u32 s8, s11, s8 +; GFX1032-NEXT: s_add_u32 s8, s9, s8 ; GFX1032-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032-NEXT: s_addc_u32 s8, s10, s9 +; GFX1032-NEXT: s_addc_u32 s8, s11, s10 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 ; GFX1032-NEXT: s_add_u32 s5, s8, s5 ; GFX1032-NEXT: s_addc_u32 s8, 0, s9 @@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 ; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, s1 ; GFX1032-NEXT: s_sub_u32 s13, s10, s0 -; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1032-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, 0 ; GFX1032-NEXT: s_cmp_ge_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 @@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX1064-NEXT: s_sub_u32 s9, 0, s0 -; GFX1064-NEXT: s_subb_u32 s10, 0, s1 +; GFX1064-NEXT: s_sub_u32 s8, 0, s0 +; GFX1064-NEXT: s_subb_u32 s9, 0, s1 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1064-NEXT: s_mul_i32 s5, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4 -; GFX1064-NEXT: s_mul_i32 s11, s10, s4 -; GFX1064-NEXT: s_add_i32 s5, s12, s5 -; GFX1064-NEXT: s_mul_i32 s13, s9, s4 -; GFX1064-NEXT: s_add_i32 s5, s5, s11 -; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13 -; GFX1064-NEXT: s_mul_i32 s15, s4, s5 -; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1064-NEXT: s_mul_i32 s11, s8, s13 -; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1064-NEXT: s_mul_i32 s10, s8, s4 +; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s9, s5 +; GFX1064-NEXT: s_add_i32 s10, s12, s10 +; GFX1064-NEXT: s_mul_i32 s13, s8, s5 +; GFX1064-NEXT: s_add_i32 s10, s10, s11 +; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13 +; GFX1064-NEXT: s_mul_i32 s15, s5, s10 +; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13 +; GFX1064-NEXT: s_mul_i32 s11, s4, s13 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10 ; GFX1064-NEXT: s_add_u32 s12, s12, s15 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5 +; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10 ; GFX1064-NEXT: s_add_u32 s11, s12, s11 -; GFX1064-NEXT: s_mul_i32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s10, s4, s10 ; GFX1064-NEXT: s_addc_u32 s11, s13, s14 ; GFX1064-NEXT: s_addc_u32 s12, s16, 0 -; GFX1064-NEXT: s_add_u32 s5, s11, s5 +; GFX1064-NEXT: s_add_u32 s10, s11, s10 ; GFX1064-NEXT: s_addc_u32 s11, 0, s12 -; GFX1064-NEXT: s_add_u32 s12, s4, s5 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_i32 s4, s9, s12 -; GFX1064-NEXT: s_addc_u32 s8, s8, s11 -; GFX1064-NEXT: s_mul_i32 s10, s10, s12 -; GFX1064-NEXT: s_mul_i32 s9, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4 -; GFX1064-NEXT: s_add_i32 s9, s13, s9 -; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4 -; GFX1064-NEXT: s_add_i32 s9, s9, s10 -; 
GFX1064-NEXT: s_mul_i32 s4, s8, s4 -; GFX1064-NEXT: s_mul_i32 s14, s12, s9 -; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9 -; GFX1064-NEXT: s_add_u32 s5, s5, s14 +; GFX1064-NEXT: s_add_u32 s5, s5, s10 +; GFX1064-NEXT: s_addc_u32 s4, s4, s11 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5 +; GFX1064-NEXT: s_mul_i32 s11, s8, s5 +; GFX1064-NEXT: s_mul_i32 s8, s8, s4 +; GFX1064-NEXT: s_mul_i32 s9, s9, s5 +; GFX1064-NEXT: s_add_i32 s8, s10, s8 +; GFX1064-NEXT: s_mul_i32 s10, s4, s11 +; GFX1064-NEXT: s_add_i32 s8, s8, s9 +; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11 +; GFX1064-NEXT: s_mul_i32 s14, s5, s8 +; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s14 +; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9 -; GFX1064-NEXT: s_add_u32 s4, s5, s4 -; GFX1064-NEXT: s_mul_i32 s9, s8, s9 -; GFX1064-NEXT: s_addc_u32 s4, s13, s11 -; GFX1064-NEXT: s_addc_u32 s5, s10, 0 -; GFX1064-NEXT: s_add_u32 s4, s4, s9 -; GFX1064-NEXT: s_addc_u32 s9, 0, s5 -; GFX1064-NEXT: s_add_u32 s10, s12, s4 -; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10 -; GFX1064-NEXT: s_addc_u32 s5, s8, s9 -; GFX1064-NEXT: s_mul_i32 s8, s3, s10 -; GFX1064-NEXT: s_mul_i32 s10, s2, s5 -; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX1064-NEXT: s_add_u32 s10, s11, s10 -; GFX1064-NEXT: s_addc_u32 s9, 0, s9 -; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5 -; GFX1064-NEXT: s_add_u32 s8, s10, s8 +; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8 +; GFX1064-NEXT: s_add_u32 s9, s9, s10 +; GFX1064-NEXT: s_mul_i32 s8, s4, s8 +; GFX1064-NEXT: s_addc_u32 s9, s13, s12 +; GFX1064-NEXT: s_addc_u32 s10, s11, 0 +; GFX1064-NEXT: s_add_u32 s8, s9, s8 +; GFX1064-NEXT: s_addc_u32 s9, 0, s10 +; GFX1064-NEXT: s_add_u32 s5, s5, s8 +; GFX1064-NEXT: s_addc_u32 s4, s4, s9 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5 +; GFX1064-NEXT: s_mul_i32 s11, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4 +; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5 ; GFX1064-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064-NEXT: s_addc_u32 s4, s9, s4 +; GFX1064-NEXT: s_add_u32 s8, s8, s11 +; GFX1064-NEXT: s_addc_u32 s10, 0, s10 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4 +; GFX1064-NEXT: s_add_u32 s5, s8, s5 +; GFX1064-NEXT: s_mul_i32 s4, s3, s4 +; GFX1064-NEXT: s_addc_u32 s5, s10, s9 ; GFX1064-NEXT: s_addc_u32 s8, s12, 0 -; GFX1064-NEXT: s_add_u32 s10, s4, s5 +; GFX1064-NEXT: s_add_u32 s10, s5, s4 ; GFX1064-NEXT: s_addc_u32 s11, 0, s8 ; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10 ; GFX1064-NEXT: s_mul_i32 s5, s0, s11 ; GFX1064-NEXT: s_mul_i32 s8, s1, s10 ; GFX1064-NEXT: s_add_i32 s4, s4, s5 -; GFX1064-NEXT: s_add_i32 s12, s4, s8 +; GFX1064-NEXT: s_add_i32 s8, s4, s8 ; GFX1064-NEXT: s_mul_i32 s4, s0, s10 -; GFX1064-NEXT: s_sub_i32 s8, s3, s12 -; GFX1064-NEXT: s_sub_u32 s13, s2, s4 +; GFX1064-NEXT: s_sub_i32 s9, s3, s8 +; GFX1064-NEXT: s_sub_u32 s12, s2, s4 ; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s14, s8, s1 -; GFX1064-NEXT: s_sub_u32 s15, s13, s0 -; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1064-NEXT: s_subb_u32 s8, s14, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s15, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, s1 +; GFX1064-NEXT: s_sub_u32 s13, s12, s0 +; GFX1064-NEXT: s_subb_u32 s9, s9, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s9, s1 ; GFX1064-NEXT: s_cselect_b32 
s14, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s8, s1 -; GFX1064-NEXT: s_cselect_b32 s8, s14, s9 -; GFX1064-NEXT: s_add_u32 s9, s10, 1 +; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s9, s1 +; GFX1064-NEXT: s_cselect_b32 s9, s13, s14 +; GFX1064-NEXT: s_add_u32 s13, s10, 1 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_add_u32 s15, s10, 2 ; GFX1064-NEXT: s_addc_u32 s16, s11, 0 -; GFX1064-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1064-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1064-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1064-NEXT: s_cselect_b32 s13, s15, s13 ; GFX1064-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s3, s3, s12 +; GFX1064-NEXT: s_subb_u32 s3, s3, s8 ; GFX1064-NEXT: s_cmp_ge_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cmp_ge_u32 s12, s0 ; GFX1064-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1064-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s1, s5, s4 ; GFX1064-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1064-NEXT: s_cselect_b32 s5, s14, s11 -; GFX1064-NEXT: s_cselect_b32 s4, s15, s10 +; GFX1064-NEXT: s_cselect_b32 s4, s13, s10 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 64d055bc40e98..4445383bd0ace 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe -; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() { 
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; GISEL-GFX12-NEXT: s_wait_alu 0xfffe -; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 From 0c7617d8184bc3eb267c42938031100a04e8764d Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Wed, 22 Oct 2025 07:50:08 -0600 Subject: [PATCH 101/530] [flang][docs] Replace references to f18 in the docs with flang (Part 1) In addition to the replacement, some light editing of the text was also carried out. These edits primarily address issues of grammar and style. The remaining references to "f18" will be replaced in a later commit. --- flang/docs/AssumedRank.md | 6 +-- flang/docs/C++17.md | 16 +++---- flang/docs/C++style.md | 12 ++--- flang/docs/Calls.md | 6 +-- flang/docs/Character.md | 8 ++-- flang/docs/DoConcurrent.md | 3 +- flang/docs/Extensions.md | 34 +++++++------- flang/docs/FortranForCProgrammers.md | 2 +- flang/docs/FortranIR.md | 2 +- flang/docs/GettingInvolved.md | 2 +- flang/docs/Intrinsics.md | 70 ++++++++++++++-------------- 11 files changed, 81 insertions(+), 80 deletions(-) diff --git a/flang/docs/AssumedRank.md b/flang/docs/AssumedRank.md index c5d2c3e3909c7..0154adc7c3ad4 100644 --- a/flang/docs/AssumedRank.md +++ b/flang/docs/AssumedRank.md @@ -101,9 +101,9 @@ Assumed-rank dummies are also represented in the represent assumed-rank in procedure characteristics. ### Runtime Representation of Assumed-Ranks -Assumed-ranks are implemented as CFI_cdesc_t (18.5.3) with the addition of an -f18 specific addendum when required for the type. This is the usual f18 -descriptor, and no changes is required to represent assumed-ranks in this data +Assumed-ranks are implemented as CFI_cdesc_t (18.5.3) with the addition of a +Flang specific addendum when required for the type. This is the usual Flang +descriptor, and no changes are required to represent assumed-ranks in this data structure. In fact, there is no difference between the runtime descriptor created for an assumed shape and the runtime descriptor created when the corresponding entity is passed as an assumed-rank. diff --git a/flang/docs/C++17.md b/flang/docs/C++17.md index f36110af86847..9137827911426 100644 --- a/flang/docs/C++17.md +++ b/flang/docs/C++17.md @@ -6,7 +6,7 @@ --> -# C++14/17 features used in f18 +# C++14/17 features used in Flang ```{contents} --- @@ -27,7 +27,7 @@ out the details of how our C++ code should look and gives guidance about feature usage. We have chosen to use some features of the recent C++17 -language standard in f18. +language standard in Flang. The most important of these are: * sum types (discriminated unions) in the form of `std::variant` * `using` template parameter packs @@ -41,7 +41,7 @@ in this list because it's not particularly well known.) ## Sum types First, some background information to explain the need for sum types -in f18. +in Flang. Fortran is notoriously problematic to lex and parse, as tokenization depends on the state of the partial parse; @@ -57,7 +57,7 @@ a unified lexer/parser. We have chosen to do so because it is simpler and should reduce both initial bugs and long-term maintenance. -Specifically, f18's parser uses the technique of recursive descent with +Specifically, Flang's parser uses the technique of recursive descent with backtracking. 
It is constructed as the incremental composition of pure parsing functions that each, when given a context (location in the input stream plus some state), @@ -73,7 +73,7 @@ of Fortran. The specification of Fortran uses a form of BNF with alternatives, optional elements, sequences, and lists. Each of these constructs -in the Fortran grammar maps directly in the f18 parser to both +in the Fortran grammar maps directly in Flang's parser to both the means of combining other parsers as alternatives, &c., and to the declarations of the parse tree data structures that represent the results of successful parses. @@ -87,10 +87,10 @@ The bounded polymorphism supplied by the C++17 `std::variant` fits those needs exactly. For example, production R502 in Fortran defines the top-level program unit of Fortran as being a function, subroutine, module, &c. -The `struct ProgramUnit` in the f18 parse tree header file +`struct ProgramUnit` in the Flang parse tree header file represents each program unit with a member that is a `std::variant` over the six possibilities. -Similarly, the parser for that type in the f18 grammar has six alternatives, +Similarly, the parser for that type in Flang's grammar has six alternatives, each of which constructs an instance of `ProgramUnit` upon the result of parsing a `Module`, `FunctionSubprogram`, and so on. @@ -99,7 +99,7 @@ parse is typically implemented with overloaded functions. A function instantiated on `ProgramUnit` will use `std::visit` to identify the right alternative and perform the right actions. The call to `std::visit` must pass a visitor that can handle all -of the possibilities, and f18 will fail to build if one is missing. +of the possibilities, and Flang will fail to build if one is missing. Were we unable to use `std::variant` directly, we would likely have chosen to implement a local `SumType` replacement; in the diff --git a/flang/docs/C++style.md b/flang/docs/C++style.md index cbb96f15eb5f1..a4ca9628b56f5 100644 --- a/flang/docs/C++style.md +++ b/flang/docs/C++style.md @@ -30,7 +30,7 @@ is clear on usage, follow it. is pretty good and comes with lots of justifications for its rules. * Reasonable exceptions to these guidelines can be made. * Be aware of some workarounds for known issues in older C++ compilers that should - still be able to compile f18. They are listed at the end of this document. + still be able to compile Flang. They are listed at the end of this document. ## In particular: @@ -261,7 +261,7 @@ move semantics, member access, and comparison for equality; suitable for use in `std::variant<>`. * `std::unique_ptr<>`: A nullable pointer with ownership, null by default, not copyable, reassignable. -F18 has a helpful `Deleter<>` class template that makes `unique_ptr<>` +Flang has a helpful `Deleter<>` class template that makes `unique_ptr<>` easier to use with forward-referenced data types. * `std::shared_ptr<>`: A nullable pointer with shared ownership via reference counting, null by default, shallowly copyable, reassignable, and slow. @@ -312,9 +312,9 @@ Consistency is one of many aspects in the pursuit of clarity, but not an end in itself. ## C++ compiler bug workarounds -Below is a list of workarounds for C++ compiler bugs met with f18 that, even -if the bugs are fixed in latest C++ compiler versions, need to be applied so -that all desired tool-chains can compile f18. +Below is a list of workarounds for C++ compiler bugs encountered when building +Flang. 
Even if the bugs are fixed in latest C++ compiler versions, these need to +be applied so that all desired tool-chains can compile Flang. ### Explicitly move noncopyable local variable into optional results @@ -338,7 +338,7 @@ std::optional fooOK() { } ``` The underlying bug is actually not specific to `std::optional` but this is the most common -case in f18 where the issue may occur. The actual bug can be reproduced with any class `B` +case in Flang where the issue may occur. The actual bug can be reproduced with any class `B` that has a perfect forwarding constructor taking `CantBeCopied` as argument: `template B(CantBeCopied&& x) x_{std::forward(x)} {}`. In such scenarios, Ubuntu 18.04 g++ fails to instantiate the move constructor diff --git a/flang/docs/Calls.md b/flang/docs/Calls.md index f518dc00ed8e8..f27af1a461bac 100644 --- a/flang/docs/Calls.md +++ b/flang/docs/Calls.md @@ -529,7 +529,7 @@ PGI passes host instance links in descriptors in additional arguments that are not always successfully forwarded across implicit interfaces, sometimes leading to crashes when they turn out to be needed. -F18 will manage a pool of trampolines in its runtime support library +Flang will manage a pool of trampolines in its runtime support library that can be used to pass internal procedures as effective arguments to F77ish procedures, so that a bare code address can serve to represent the effective argument. @@ -569,14 +569,14 @@ Fortran 2018 explicitly enables us to do this with a correction to Fortran 2003 in 4.3.4(5). Last, there must be reasonably permanent naming conventions used -by the F18 runtime library for those unrestricted specific intrinsic +by Flang's runtime library for those unrestricted specific intrinsic functions (table 16.2 in 16.8) and extensions that can be passed as arguments. In these cases where external naming is at the discretion of the implementation, we should use names that are not in the C language user namespace, begin with something that identifies -the current incompatible version of F18, the module, the submodule, and +the current incompatible version of Flang, the module, the submodule, and elemental SIMD width, and are followed by the external name. The parts of the external name can be separated by some character that is acceptable for use in LLVM IR and assembly language but not in user diff --git a/flang/docs/Character.md b/flang/docs/Character.md index 4e1d40774d6db..96e0a069e2ba2 100644 --- a/flang/docs/Character.md +++ b/flang/docs/Character.md @@ -6,7 +6,7 @@ --> -# Implementation of `CHARACTER` types in f18 +# Implementation of `CHARACTER` types in Flang ```{contents} --- @@ -16,7 +16,7 @@ local: ## Kinds and Character Sets -The f18 compiler and runtime support three kinds of the intrinsic +The Flang compiler and runtime support three kinds of the intrinsic `CHARACTER` type of Fortran 2018. The default (`CHARACTER(KIND=1)`) holds 8-bit character codes; `CHARACTER(KIND=2)` holds 16-bit character codes; @@ -108,12 +108,12 @@ The result of `//` may be used * as the value of a specifier of an I/O statement, * or as the value of a statement function. -The f18 compiler has a general (but slow) means of implementing concatenation +The Flang compiler has a general (but slow) means of implementing concatenation and a specialized (fast) option to optimize the most common case. 
### General concatenation -In the most general case, the f18 compiler's generated code and +In the most general case, Flang's generated code and runtime support library represent the result as a deferred-length allocatable `CHARACTER` temporary scalar or array variable that is initialized as a zero-length array by `AllocatableInitCharacter()` diff --git a/flang/docs/DoConcurrent.md b/flang/docs/DoConcurrent.md index bd1008af86f6b..eba2656fc9e52 100644 --- a/flang/docs/DoConcurrent.md +++ b/flang/docs/DoConcurrent.md @@ -280,7 +280,8 @@ Specifically, an easy means is required that stipulates that localization should apply at most only to the obvious cases of local non-pointer non-allocatable scalars. -In the LLVM Fortran compiler project (a/k/a "flang", "f18") we considered +In the LLVM Fortran compiler project (now known as "flang", previously also +known as "f18") we considered several solutions to this problem. 1. Add syntax (e.g., `DO PARALLEL` or `DO CONCURRENT() DEFAULT(PARALLEL)`) by which one can inform the compiler that it should localize only diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 420b7517922b7..6d872094811e3 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -84,7 +84,7 @@ end be "local identifiers" and should be distinct in the "inclusive scope" -- i.e., not scoped by `BLOCK` constructs. As most (but not all) compilers implement `BLOCK` scoping of construct - names, so does f18, with a portability warning. + names, so does Flang, with a portability warning. * 15.6.4 paragraph 2 prohibits an implicitly typed statement function from sharing the same name as a symbol in its scope's host, if it has one. @@ -153,7 +153,7 @@ end that a call to intrinsic module procedure `ieee_support_halting` with a constant argument has a compile time constant result in `constant expression` and `specification expression` contexts. In compilations - where this information is not known at compile time, f18 generates code + where this information is not known at compile time, Flang generates code to determine the absence or presence of this capability at runtime. A call to `ieee_support_halting` in contexts that the standard requires to be constant will generate a compilation error. `ieee_support_standard` @@ -366,7 +366,7 @@ end * The legacy extension intrinsic functions `IZEXT` and `JZEXT` are supported; `ZEXT` has different behavior with various older compilers, so it is not supported. -* f18 doesn't impose a limit on the number of continuation lines +* Flang doesn't impose a limit on the number of continuation lines allowed for a single statement. * When a type-bound procedure declaration statement has neither interface nor attributes, the "::" before the bindings is optional, even @@ -553,7 +553,7 @@ end * Fortran explicitly ignores type declaration statements when they attempt to type the name of a generic intrinsic function (8.2 p3). One can declare `CHARACTER::COS` and still get a real result - from `COS(3.14159)`, for example. f18 will complain when a + from `COS(3.14159)`, for example. Flang will complain when a generic intrinsic function's inferred result type does not match an explicit declaration. This message is a warning. 
@@ -570,7 +570,7 @@ end ## Standard features that might as well not be -* f18 supports designators with constant expressions, properly +* Flang supports designators with constant expressions, properly constrained, as initial data targets for data pointers in initializers of variable and component declarations and in `DATA` statements; e.g., `REAL, POINTER :: P => T(1:10:2)`. @@ -587,8 +587,8 @@ end * The standard doesn't explicitly require that a named constant that appears as part of a complex-literal-constant be a scalar, but most compilers emit an error when an array appears. - f18 supports them with a portability warning. -* f18 does not enforce a blanket prohibition against generic + Flang supports them with a portability warning. +* Flang does not enforce a blanket prohibition against generic interfaces containing a mixture of functions and subroutines. We allow both to appear, unlike several other Fortran compilers. This is especially desirable when two generics of the same @@ -655,7 +655,7 @@ end treat them as references to implicitly typed local variables, and load uninitialized values. - In f18, we chose to emit an error message for this case since the standard + In Flang, we chose to emit an error message for this case since the standard is unclear, the usage is not portable, and the issue can be easily resolved by adding a declaration. @@ -686,7 +686,7 @@ end * When a `DATA` statement in a `BLOCK` construct could be construed as either initializing a host-associated object or declaring a new local - initialized object, f18 interprets the standard's classification of + initialized object, Flang interprets the standard's classification of a `DATA` statement as being a "declaration" rather than a "specification" construct, and notes that the `BLOCK` construct is defined as localizing names that have specifications in the `BLOCK` construct. @@ -703,7 +703,7 @@ end subroutine Other Fortran compilers disagree with each other in their interpretations of this example. The precedent among the most commonly used compilers - agrees with f18's interpretation: a `DATA` statement without any other + agrees with Flang's interpretation: a `DATA` statement without any other specification of the name refers to the host-associated object. * Many Fortran compilers allow a non-generic procedure to be `USE`-associated @@ -729,7 +729,7 @@ module m2 end module ``` - This case elicits a warning from f18, as it should not be treated + This case elicits a warning from Flang, as it should not be treated any differently than the same case with the non-generic procedure of the same name being defined in the same scope rather than being `USE`-associated into it, which is explicitly non-conforming in the @@ -747,7 +747,7 @@ end module symbols, much less appear in specification inquiries, and there are application codes that expect exterior symbols whose names match components to be visible in a derived-type definition's default initialization - expressions, and so f18 follows that precedent. + expressions, and so Flang follows that precedent. * 19.3.1p1 "Within its scope, a local identifier of an entity of class (1) or class (4) shall not be the same as a global identifier used in that scope..." @@ -769,17 +769,17 @@ end module left-hand side for a pointer assignment statement, and we emit a portability warning when it is not. 
-* F18 allows a `USE` statement to reference a module that is defined later +* Flang allows a `USE` statement to reference a module that is defined later in the same compilation unit, so long as mutual dependencies do not form a cycle. This feature forestalls any risk of such a `USE` statement reading an obsolete module file from a previous compilation and then overwriting that file later. -* F18 allows `OPTIONAL` dummy arguments to interoperable procedures +* Flang allows `OPTIONAL` dummy arguments to interoperable procedures unless they are `VALUE` (C865). -* F18 processes the `NAMELIST` group declarations in a scope after it +* Flang processes the `NAMELIST` group declarations in a scope after it has resolved all of the names in that scope. This means that names that appear before their local declarations do not resolve to host associated objects and do not elicit errors about improper redeclarations @@ -862,11 +862,11 @@ print *, [(j,j=1,10)] * The Fortran standard doesn't mention integer overflow explicitly. In many cases, however, integer overflow makes programs non-conforming. - F18 follows other widely-used Fortran compilers. Specifically, f18 assumes + Flang follows other widely-used Fortran compilers. Specifically, Flang assumes integer overflow never occurs in address calculations and increment of do-variable unless the option `-fwrapv` is enabled. -* Two new ieee_round_type values were added in f18 beyond the four values +* Two new ieee_round_type values were added in Flang beyond the four values defined in f03 and f08: ieee_away and ieee_other. Contemporary hardware typically does not have support for these rounding modes; ieee_support_rounding calls for these values return false. diff --git a/flang/docs/FortranForCProgrammers.md b/flang/docs/FortranForCProgrammers.md index 135e6b71711d3..9023fdc8a6038 100644 --- a/flang/docs/FortranForCProgrammers.md +++ b/flang/docs/FortranForCProgrammers.md @@ -304,7 +304,7 @@ Preprocessing behavior varies across implementations and one should not depend o much portability. Preprocessing is typically requested by the use of a capitalized filename suffix (e.g., "foo.F90") or a compiler command line option. -(Since the F18 compiler always runs its built-in preprocessing stage, +(Since Flang always runs its built-in preprocessing stage, no special option or filename suffix is required.) ## "Object Oriented" Programming diff --git a/flang/docs/FortranIR.md b/flang/docs/FortranIR.md index f9f8f6416b37a..7f3c7b2ae5241 100644 --- a/flang/docs/FortranIR.md +++ b/flang/docs/FortranIR.md @@ -171,7 +171,7 @@ FIR is intentionally similar to SIL from the statement level up to the level of Program, procedure, region, and basic block all leverage code from LLVM, in much the same way as SIL. These data structures have significant investment and engineering behind their use in compilers, and it makes sense to leverage that work. * Pro: Uses LLVM data structures, pervasive in compiler projects such as LLVM, SIL, etc. 
-* Pro: Get used to seeing and using LLVM, as f18 aims to be an LLVM project +* Pro: Get used to seeing and using LLVM, as Flang aims to be an LLVM project * Con: Uses LLVM data structures, which the project has been avoiding #### Alternative: C++ Standard Template Library diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md index 79af78897c9e4..2d283422b7308 100644 --- a/flang/docs/GettingInvolved.md +++ b/flang/docs/GettingInvolved.md @@ -41,7 +41,7 @@ Contributions to Flang are done using GitHub Pull Requests and follow the ### Flang Slack Workspace - There is a Slack workspace dedicated to Flang. -- There are a number of topic-oriented channels available (e.g., #driver, #f18-semantics, #fir). +- There are a number of topic-oriented channels available (e.g., #driver, #fir). - Add yourself via the *[invitation link](https://join.slack.com/t/flang-compiler/shared_invite/zt-2pcn51lh-VrRQL_YUOkxA_1CEfMGQhw "title")* ## Calls diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 34b6559e4345f..bfda5f3253a68 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -19,7 +19,7 @@ of functions or subroutines with similar interfaces as an aid to comprehension beyond that which might be gained from the standard's alphabetical list. -A brief status of intrinsic procedure support in f18 is also given at the end. +A brief status of intrinsic procedure support in Flang is also given at the end. Few procedures are actually described here apart from their interfaces; see the Fortran 2018 standard (section 16) for the complete story. @@ -733,20 +733,20 @@ In case the invocation would be an error if the procedure were the intrinsic leaves two choices to the compiler: emit an error about the intrinsic invocation, or consider this is an external procedure and emit no error. -f18 will always consider this case to be the intrinsic and emit errors, unless the procedure +Flang will always consider this case to be the intrinsic and emit errors, unless the procedure is used as a function (resp. subroutine) and the intrinsic is a subroutine (resp. function). The table below gives some examples of decisions made by Fortran compilers in such case. | What is ACOS ? | Bad intrinsic call | External with warning | External no warning | Other error | | --- | --- | --- | --- | --- | -| `print*, ACOS()` | gfortran, nag, xlf, f18 | ifort | nvfortran | | -| `print*, ACOS(I)` | gfortran, nag, xlf, f18 | ifort | nvfortran | | -| `print*, ACOS(X=I)` | gfortran, nag, xlf, f18 | ifort | | nvfortran (keyword on implicit extrenal )| -| `print*, ACOS(X, X)` | gfortran, nag, xlf, f18 | ifort | nvfortran | | -| `CALL ACOS(X)` | | | gfortran, nag, xlf, nvfortran, ifort, f18 | | +| `print*, ACOS()` | gfortran, nag, xlf, flang | ifort | nvfortran | | +| `print*, ACOS(I)` | gfortran, nag, xlf, flang | ifort | nvfortran | | +| `print*, ACOS(X=I)` | gfortran, nag, xlf, flang | ifort | | nvfortran (keyword on implicit extrenal )| +| `print*, ACOS(X, X)` | gfortran, nag, xlf, flang | ifort | nvfortran | | +| `CALL ACOS(X)` | | | gfortran, nag, xlf, nvfortran, ifort, flang | | -The rationale for f18 behavior is that when referring to a procedure with an +The rationale for Flang behavior is that when referring to a procedure with an argument number or type that does not match the intrinsic specification, it seems safer to block the rather likely case where the user is using the intrinsic the wrong way. 
In case the user wanted to refer to an external function, he can add an explicit EXTERNAL @@ -759,13 +759,13 @@ Also note that in general, the standard gives the compiler the right to consider any procedure that is not explicitly external as a non standard intrinsic (section 4.2 point 4). So it is highly advised for the programmer to use EXTERNAL statements to prevent any ambiguity. -## Intrinsic Procedure Support in f18 -This section gives an overview of the support inside f18 libraries for the +## Intrinsic Procedure Support in Flang +This section gives an overview of the support inside Flang libraries for the intrinsic procedures listed above. -It may be outdated, refer to f18 code base for the actual support status. +It may be outdated, refer to Flang code base for the actual support status. ### Semantic Analysis -F18 semantic expression analysis phase detects intrinsic procedure references, +Flang semantic expression analysis phase detects intrinsic procedure references, validates the argument types and deduces the return types. This phase currently supports all the intrinsic procedures listed above but the ones in the table below. @@ -789,17 +789,17 @@ Constant Expressions may be used to define kind arguments. Therefore, the semant expression analysis phase must be able to fold references to intrinsic functions listed in section 10.1.12. -F18 intrinsic function folding is either performed by implementations directly -operating on f18 scalar types or by using host runtime functions and -host hardware types. F18 supports folding elemental intrinsic functions over +Flang intrinsic function folding is either performed by implementations directly +operating on Flang scalar types or by using host runtime functions and +host hardware types. Flang supports folding elemental intrinsic functions over arrays when an implementation is provided for the scalars (regardless of whether it is using host hardware types or not). The status of intrinsic function folding support is given in the sub-sections below. #### Intrinsic Functions with Host Independent Folding Support -Implementations using f18 scalar types enables folding intrinsic functions -on any host and with any possible type kind supported by f18. The intrinsic functions -listed below are folded using host independent implementations. +Implementations using Flang scalar types enables folding intrinsic functions +on any host and with any possible type kind supported by Flang. The intrinsic +functions listed below are folded using host independent implementations. | Return Type | Intrinsic Functions with Host Independent Folding Support| | --- | --- | @@ -810,12 +810,12 @@ listed below are folded using host independent implementations. #### Intrinsic Functions with Host Dependent Folding Support Implementations using the host runtime may not be available for all supported -f18 types depending on the host hardware types and the libraries available on the host. +Flang types depending on the hardware type of the host and the libraries available on it. The actual support on a host depends on what the host hardware types are. The list below gives the functions that are folded using host runtime and the related C/C++ types. -F18 automatically detects if these types match an f18 scalar type. If so, -folding of the intrinsic functions will be possible for the related f18 scalar type, -otherwise an error message will be produced by f18 when attempting to fold related intrinsic functions. 
+Flang automatically detects if these types match an Flang scalar type. If so, +folding of the intrinsic functions will be possible for the related Flang scalar type, +otherwise an error message will be produced by Flang when attempting to fold related intrinsic functions. | C/C++ Host Type | Intrinsic Functions with Host Standard C++ Library Based Folding Support | | --- | --- | @@ -823,17 +823,17 @@ otherwise an error message will be produced by f18 when attempting to fold relat | std::complex for float, double and long double| ACOS, ACOSH, ASIN, ASINH, ATAN, ATANH, COS, COSH, EXP, LOG, SIN, SINH, SQRT, TAN, TANH | On top of the default usage of C++ standard library functions for folding described -in the table above, it is possible to compile f18 evaluate library with +in the table above, it is possible to compile Flang evaluate library with [libpgmath](https://github.com/flang-compiler/flang/tree/master/runtime/libpgmath) so that it can be used for folding. To do so, one must have a compiled version of the libpgmath library available on the host and add -`-DLIBPGMATH_DIR=` to the f18 cmake command. +`-DLIBPGMATH_DIR=` to the Flang cmake command. Libpgmath comes with real and complex functions that replace C++ standard library float and double functions to fold all the intrinsic functions listed in the table above. -It has no long double versions. If the host long double matches an f18 scalar type, +It has no long double versions. If the host long double matches a Flang scalar type, C++ standard library functions will still be used for folding expressions with this scalar type. -Libpgmath adds the possibility to fold the following functions for f18 real scalar +Libpgmath adds the possibility to fold the following functions for Flang's real scalar types related to host float and double types. | C/C++ Host Type | Additional Intrinsic Function Folding Support with Libpgmath (Optional) | @@ -841,10 +841,10 @@ types related to host float and double types. |float and double| BESSEL_J0, BESSEL_J1, BESSEL_JN (elemental only), BESSEL_Y0, BESSEL_Y1, BESSEL_Yn (elemental only), DERFC_SCALED, ERFC_SCALED, QERFC_SCALED | Libpgmath comes in three variants (precise, relaxed and fast). So far, only the -precise version is used for intrinsic function folding in f18. It guarantees the greatest numerical precision. +precise version is used for intrinsic function folding in Flang. It guarantees the greatest numerical precision. ### Intrinsic Functions with Missing Folding Support -The following intrinsic functions are allowed in constant expressions but f18 +The following intrinsic functions are allowed in constant expressions but Flang is not yet able to fold them. Note that there might be constraints on the arguments so that these intrinsics can be used in constant expressions (see section 10.1.12 of Fortran 2018 standard). @@ -1133,8 +1133,8 @@ end program rename_proc - **Standard:** GNU extension - **Class:** function - **Syntax:** result = `SECNDS(refTime)` -- **Arguments:** - +- **Arguments:** + | ARGUMENT | INTENT | TYPE | KIND | Description | |-----------|--------|---------------|-------------------------|------------------------------------------| | `refTime` | `IN` | `REAL, scalar`| REAL(KIND=4), required | Reference time in seconds since midnight | @@ -1157,16 +1157,16 @@ END PROGRAM example_secnds since midnight minus a user-supplied reference time `refTime`. Uses `REAL(KIND=8)` for higher precision. 
#### Usage and Info -- **Standard:** PGI extension -- **Class:** function -- **Syntax:** result = `DSECNDS(refTime)` -- **Arguments:** +- **Standard:** PGI extension +- **Class:** function +- **Syntax:** result = `DSECNDS(refTime)` +- **Arguments:** | ARGUMENT | INTENT | TYPE | KIND | Description | |-----------|--------|---------------|-------------------------|------------------------------------------| | `refTime` | `IN` | `REAL, scalar`| REAL(KIND=8), required | Reference time in seconds since midnight | -- **Return Value:** REAL(KIND=8), scalar — seconds elapsed since `refTime`. +- **Return Value:** REAL(KIND=8), scalar — seconds elapsed since `refTime`. - **Purity:** Impure #### Example From 35db983596f0f7deb67e261d77b0daad0bbe5ba9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Oct 2025 06:50:52 -0700 Subject: [PATCH 102/530] [ADT] Modernize type traits in BitmaskEnum.h (NFC) (#164567) This patch replaces std::enable_if_t with std::void_t in two type traits. Both approaches enable the template specialization if and only if the LLVM_BITMASK_LARGEST_ENUMERATOR enumerator exists. --- llvm/include/llvm/ADT/BitmaskEnum.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h index d464cbcfcdffd..9555fadda6e47 100644 --- a/llvm/include/llvm/ADT/BitmaskEnum.h +++ b/llvm/include/llvm/ADT/BitmaskEnum.h @@ -106,7 +106,7 @@ struct is_bitmask_enum : std::false_type {}; template struct is_bitmask_enum< - E, std::enable_if_t= 0>> + E, std::void_t> : std::true_type {}; /// Trait class to determine bitmask enumeration largest bit. @@ -114,7 +114,7 @@ template struct largest_bitmask_enum_bit; template struct largest_bitmask_enum_bit< - E, std::enable_if_t= 0>> { + E, std::void_t> { using UnderlyingTy = std::underlying_type_t; static constexpr UnderlyingTy value = static_cast(E::LLVM_BITMASK_LARGEST_ENUMERATOR); From d5a2047c58b428e848d1a540430cbb46d38e2b97 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Oct 2025 06:51:00 -0700 Subject: [PATCH 103/530] [CodeGen] Remove an unused #include (NFC) (#164569) We've switched to llvm::identity_cxx20 for SparseMultiSet, so we don't need llvm::identity in this file. --- llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index 4eacbdca163d9..26d708060ed6d 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -18,7 +18,6 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseMultiSet.h" -#include "llvm/ADT/identity.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" From 6bee6b2090a7cd0dedbc0af789a7cb4648e974f2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Oct 2025 06:51:08 -0700 Subject: [PATCH 104/530] [CodeGen] Add "override" where appropriate (NFC) (#164571) Note that "override" makes "virtual" redundant. Identified with modernize-use-override. 
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 24 +++++++++---------- .../llvm/CodeGen/CodeGenTargetMachineImpl.h | 2 +- llvm/include/llvm/CodeGen/DebugHandlerBase.h | 2 +- .../llvm/CodeGen/DroppedVariableStatsMIR.h | 9 ++++--- .../include/llvm/CodeGen/GlobalISel/CSEInfo.h | 6 ++--- .../llvm/CodeGen/GlobalISel/Combiner.h | 2 +- .../CodeGen/GlobalISel/GISelValueTracking.h | 2 +- .../llvm/CodeGen/GlobalISel/IRTranslator.h | 2 +- .../CodeGen/GlobalISel/InstructionSelector.h | 2 +- .../llvm/CodeGen/MachineModuleSlotTracker.h | 2 +- llvm/include/llvm/CodeGen/MachineOutliner.h | 2 +- .../llvm/CodeGen/ResourcePriorityQueue.h | 2 +- .../include/llvm/CodeGen/TargetRegisterInfo.h | 2 +- .../llvm/CodeGen/VLIWMachineScheduler.h | 2 +- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 2 +- llvm/lib/CodeGen/GlobalISel/Combiner.cpp | 4 ++-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 +- .../LiveDebugValues/LiveDebugValues.cpp | 2 +- .../LiveDebugValues/VarLocBasedImpl.cpp | 2 +- llvm/lib/CodeGen/MachineBasicBlock.cpp | 2 +- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 2 +- .../unittests/CodeGen/AsmPrinterDwarfTest.cpp | 16 ++++++------- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 2 +- 23 files changed, 47 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 4f27d9f103e91..76b6c8ec68c72 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -366,7 +366,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { protected: explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) : BaseT(DL) {} - virtual ~BasicTTIImplBase() = default; + ~BasicTTIImplBase() override = default; using TargetTransformInfoImplBase::DL; @@ -821,13 +821,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { SimplifyAndSetOp); } - virtual std::optional + std::optional getCacheSize(TargetTransformInfo::CacheLevel Level) const override { return std::optional( getST()->getCacheSize(static_cast(Level))); } - virtual std::optional + std::optional getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override { std::optional TargetResult = getST()->getCacheAssociativity(static_cast(Level)); @@ -838,31 +838,31 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return BaseT::getCacheAssociativity(Level); } - virtual unsigned getCacheLineSize() const override { + unsigned getCacheLineSize() const override { return getST()->getCacheLineSize(); } - virtual unsigned getPrefetchDistance() const override { + unsigned getPrefetchDistance() const override { return getST()->getPrefetchDistance(); } - virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, - unsigned NumStridedMemAccesses, - unsigned NumPrefetches, - bool HasCall) const override { + unsigned getMinPrefetchStride(unsigned NumMemAccesses, + unsigned NumStridedMemAccesses, + unsigned NumPrefetches, + bool HasCall) const override { return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, NumPrefetches, HasCall); } - virtual unsigned getMaxPrefetchIterationsAhead() const override { + unsigned getMaxPrefetchIterationsAhead() const override { return getST()->getMaxPrefetchIterationsAhead(); } - virtual bool enableWritePrefetching() const override { + bool enableWritePrefetching() const override { return getST()->enableWritePrefetching(); } - virtual bool shouldPrefetchAddressSpace(unsigned AS) const override { + bool shouldPrefetchAddressSpace(unsigned AS) const 
override { return getST()->shouldPrefetchAddressSpace(AS); } diff --git a/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h b/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h index 3950b95f859c3..7a6feda9bda56 100644 --- a/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h +++ b/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h @@ -42,7 +42,7 @@ class LLVM_ABI CodeGenTargetMachineImpl : public TargetMachine { /// Create a pass configuration object to be used by addPassToEmitX methods /// for generating a pipeline of CodeGen passes. - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; /// Add passes to the specified pass manager to get the specified file /// emitted. Typically this will involve several steps of code generation. diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index fee4bb116bb87..e72801b6caf34 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -118,7 +118,7 @@ class DebugHandlerBase : public AsmPrinterHandler { // AsmPrinterHandler overrides. public: - virtual ~DebugHandlerBase() override; + ~DebugHandlerBase() override; void beginModule(Module *M) override; diff --git a/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h index bc8dc1b23fa9e..6da10d8148239 100644 --- a/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h +++ b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h @@ -44,12 +44,11 @@ class LLVM_ABI DroppedVariableStatsMIR : public DroppedVariableStats { StringRef FuncOrModName); /// Override base class method to run on an llvm::MachineFunction /// specifically. - virtual void - visitEveryInstruction(unsigned &DroppedCount, - DenseMap &InlinedAtsMap, - VarID Var) override; + void visitEveryInstruction(unsigned &DroppedCount, + DenseMap &InlinedAtsMap, + VarID Var) override; /// Override base class method to run on DBG_VALUEs specifically. - virtual void visitEveryDebugRecord( + void visitEveryDebugRecord( DenseSet &VarIDSet, DenseMap> &InlinedAtsMap, StringRef FuncName, bool Before) override; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h index ea3f1a8375c4a..6701ae0510581 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h @@ -40,14 +40,14 @@ class UniqueMachineInstr : public FoldingSetNode { // A CSE config for fully optimized builds. class LLVM_ABI CSEConfigFull : public CSEConfigBase { public: - virtual ~CSEConfigFull() = default; + ~CSEConfigFull() override = default; bool shouldCSEOpc(unsigned Opc) override; }; // Commonly used for O0 config. 
class LLVM_ABI CSEConfigConstantOnly : public CSEConfigBase { public: - virtual ~CSEConfigConstantOnly() = default; + ~CSEConfigConstantOnly() override = default; bool shouldCSEOpc(unsigned Opc) override; }; @@ -118,7 +118,7 @@ class LLVM_ABI GISelCSEInfo : public GISelChangeObserver { public: GISelCSEInfo() = default; - virtual ~GISelCSEInfo(); + ~GISelCSEInfo() override; void setMF(MachineFunction &MF); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h index 39ff90c2687f4..7a313f493a6d3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h @@ -60,7 +60,7 @@ class Combiner : public GIMatchTableExecutor { Combiner(MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, GISelValueTracking *VT, GISelCSEInfo *CSEInfo = nullptr); - virtual ~Combiner(); + ~Combiner() override; virtual bool tryCombineAll(MachineInstr &I) const = 0; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h index 2db66ba9584a3..17d656abdab34 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h @@ -58,7 +58,7 @@ class LLVM_ABI GISelValueTracking : public GISelChangeObserver { public: GISelValueTracking(MachineFunction &MF, unsigned MaxDepth = 6); - ~GISelValueTracking() = default; + ~GISelValueTracking() override = default; const MachineFunction &getMachineFunction() const { return MF; } diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 3d7ccd55ee042..268025e7018d3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -656,7 +656,7 @@ class IRTranslator : public MachineFunctionPass { IRT->addSuccessorWithProb(Src, Dst, Prob); } - virtual ~GISelSwitchLowering() = default; + ~GISelSwitchLowering() override = default; private: IRTranslator *IRT; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index cf65f34ce805a..569407963695e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -21,7 +21,7 @@ class GISelObserverWrapper; class LLVM_ABI InstructionSelector : public GIMatchTableExecutor { public: - virtual ~InstructionSelector(); + ~InstructionSelector() override; /// Select the (possibly generic) instruction \p I to only use target-specific /// opcodes. 
It is OK to insert multiple instructions, but they cannot be diff --git a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h index 770f1b37f24f5..55048960fd4d5 100644 --- a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h +++ b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h @@ -37,7 +37,7 @@ class LLVM_ABI MachineModuleSlotTracker : public ModuleSlotTracker { MachineModuleSlotTracker(const MachineModuleInfo &MMI, const MachineFunction *MF, bool ShouldInitializeAllMetadata = true); - ~MachineModuleSlotTracker(); + ~MachineModuleSlotTracker() override; void collectMachineMDNodes(MachineMDNodeListType &L) const; }; diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h index fbb958ccf6488..66cab3d652104 100644 --- a/llvm/include/llvm/CodeGen/MachineOutliner.h +++ b/llvm/include/llvm/CodeGen/MachineOutliner.h @@ -306,7 +306,7 @@ struct GlobalOutlinedFunction : public OutlinedFunction { } GlobalOutlinedFunction() = delete; - ~GlobalOutlinedFunction() = default; + ~GlobalOutlinedFunction() override = default; }; } // namespace outliner diff --git a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h index c15bc677ae53f..0af4f4720212b 100644 --- a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h +++ b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h @@ -75,7 +75,7 @@ namespace llvm { public: ResourcePriorityQueue(SelectionDAGISel *IS); - ~ResourcePriorityQueue(); + ~ResourcePriorityQueue() override; bool isBottomUp() const override { return false; } diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 822245f13edff..f031353422e40 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -280,7 +280,7 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { unsigned Mode = 0); public: - virtual ~TargetRegisterInfo(); + ~TargetRegisterInfo() override; /// Return the number of registers for the function. (may overestimate) virtual unsigned getNumSupportedRegs(const MachineFunction &) const { diff --git a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h index 112ff6df03445..65ff1ebe9bd8a 100644 --- a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h +++ b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h @@ -223,7 +223,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy { enum { TopQID = 1, BotQID = 2, LogMaxQID = 2 }; ConvergingVLIWScheduler() : Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} - virtual ~ConvergingVLIWScheduler() = default; + ~ConvergingVLIWScheduler() override = default; void initialize(ScheduleDAGMI *dag) override; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 9288d7edbf156..9c0b68b315b50 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -334,7 +334,7 @@ class DwarfUnit : public DIEUnit { const DIE &TyDIE); protected: - ~DwarfUnit(); + ~DwarfUnit() override; /// Create new static data member DIE. 
DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT); diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp index 2cba6f0504540..066543744984e 100644 --- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp @@ -62,7 +62,7 @@ class Combiner::WorkListMaintainer : public GISelChangeObserver { static std::unique_ptr create(Level Lvl, WorkListTy &WorkList, MachineRegisterInfo &MRI); - virtual ~WorkListMaintainer() = default; + ~WorkListMaintainer() override = default; void reportFullyCreatedInstrs() { LLVM_DEBUG({ @@ -95,7 +95,7 @@ class Combiner::WorkListMaintainerImpl : public Combiner::WorkListMaintainer { WorkListMaintainerImpl(WorkListTy &WorkList, MachineRegisterInfo &MRI) : WorkList(WorkList), MRI(MRI) {} - virtual ~WorkListMaintainerImpl() = default; + ~WorkListMaintainerImpl() override = default; void reset() override { DeferList.clear(); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 884c3f1692e94..1fe38d61e400c 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -139,7 +139,7 @@ class DILocationVerifier : public GISelChangeObserver { public: DILocationVerifier() = default; - ~DILocationVerifier() = default; + ~DILocationVerifier() override = default; const Instruction *getCurrentInst() const { return CurrInst; } void setCurrentInst(const Instruction *Inst) { CurrInst = Inst; } diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp index b655375303137..94e3a8273e843 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp @@ -69,7 +69,7 @@ class LiveDebugValuesLegacy : public MachineFunctionPass { static char ID; LiveDebugValuesLegacy(); - ~LiveDebugValuesLegacy() = default; + ~LiveDebugValuesLegacy() override = default; /// Calculate the liveness information for the given machine function. bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index b9ea03f949ef8..1c4b2f9136857 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -1094,7 +1094,7 @@ class VarLocBasedLDV : public LDVImpl { /// Default construct and initialize the pass. VarLocBasedLDV(); - ~VarLocBasedLDV(); + ~VarLocBasedLDV() override; /// Print to ostream with a message. 
void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V, diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 1cb57a4fa4258..ba0b025167307 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -1137,7 +1137,7 @@ class SlotIndexUpdateDelegate : public MachineFunction::Delegate { MF.setDelegate(this); } - ~SlotIndexUpdateDelegate() { + ~SlotIndexUpdateDelegate() override { MF.resetDelegate(this); for (auto MI : Insertions) Indexes->insertMachineInstrInMaps(*MI); diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index e1d39d64e9fb8..493d8dfa8c00a 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -196,7 +196,7 @@ class CopyRewriter : public Rewriter { CopyRewriter(MachineInstr &MI) : Rewriter(MI) { assert(MI.isCopy() && "Expected copy instruction"); } - virtual ~CopyRewriter() = default; + ~CopyRewriter() override = default; bool getNextRewritableSource(RegSubRegPair &Src, RegSubRegPair &Dst) override { diff --git a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp index 6c08173f78622..af2d56df33d38 100644 --- a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp +++ b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp @@ -383,14 +383,14 @@ class AsmPrinterHandlerTest : public AsmPrinterFixtureBase { public: TestHandler(AsmPrinterHandlerTest &Test) : Test(Test) {} - virtual ~TestHandler() {} - virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {} - virtual void beginModule(Module *M) override { Test.BeginCount++; } - virtual void endModule() override { Test.EndCount++; } - virtual void beginFunction(const MachineFunction *MF) override {} - virtual void endFunction(const MachineFunction *MF) override {} - virtual void beginInstruction(const MachineInstr *MI) override {} - virtual void endInstruction() override {} + ~TestHandler() override {} + void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {} + void beginModule(Module *M) override { Test.BeginCount++; } + void endModule() override { Test.EndCount++; } + void beginFunction(const MachineFunction *MF) override {} + void endFunction(const MachineFunction *MF) override {} + void beginInstruction(const MachineInstr *MI) override {} + void endInstruction() override {} }; protected: diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index ce2a38b785655..ff87e7b6a1018 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -69,7 +69,7 @@ class InstrRefLDVTest : public testing::Test { InstrRefLDVTest() : Ctx(), Mod(std::make_unique("beehives", Ctx)) {} - void SetUp() { + void SetUp() override { // Boilerplate that creates a MachineFunction and associated blocks. Mod->setDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-" From 0b9ed5dc64149147aadc588d9077d80fb73a54d6 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 22 Oct 2025 22:04:13 +0800 Subject: [PATCH 105/530] [polly] Remove unsafe-fp-math uses (NFC) (#164603) Post cleanup for #164534. 
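For illustration only, here is a minimal sketch of the shape of this cleanup; the function name @hypothetical_kernel and its body are hypothetical and not taken from the hunks below. In each affected test, the #N attribute-group reference on a definition or declaration is dropped together with the matching `attributes #N = { ... }` line, since the fp-math strings (including "unsafe-fp-math"="false") are irrelevant to what these Polly tests check:

    ; Before (hypothetical): the definition references an attribute group
    ; that carries "unsafe-fp-math"="false" among other string attributes.
    define void @hypothetical_kernel() #0 {
    entry:
      ret void
    }

    attributes #0 = { nounwind uwtable "unsafe-fp-math"="false" }

    ; After (hypothetical): the #0 reference and the attributes line are
    ; removed, leaving plain `define void @hypothetical_kernel() {`.
    ; The Polly CHECK lines are unaffected, so the change is NFC.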
--- polly/docs/experiments/matmul/matmul.ll | 11 ++-- .../experiments/matmul/matmul.normalopt.ll | 18 +++---- ....polly.interchanged+tiled+vector+openmp.ll | 45 +++++++---------- .../matmul.polly.interchanged+tiled+vector.ll | 21 +++----- .../matmul/matmul.polly.interchanged+tiled.ll | 21 +++----- .../matmul/matmul.polly.interchanged.ll | 21 +++----- .../docs/experiments/matmul/matmul.preopt.ll | 18 +++---- .../test/CodeGen/OpenMP/mapped-phi-access.ll | 5 +- ...ference-argument-from-non-affine-region.ll | 4 +- ...d_instruction_referenced_by_parameter_1.ll | 27 ++++------ ...d_instruction_referenced_by_parameter_2.ll | 10 ++-- polly/test/CodeGen/debug-intrinsics.ll | 9 ++-- ...or_block_contains_invalid_memory_access.ll | 10 ++-- polly/test/CodeGen/hoisting_1.ll | 4 +- polly/test/CodeGen/hoisting_2.ll | 4 +- polly/test/CodeGen/intrinsics_lifetime.ll | 13 ++--- polly/test/CodeGen/intrinsics_misc.ll | 19 +++---- .../CodeGen/invariant_cannot_handle_void.ll | 4 +- .../invariant_load_different_sized_types.ll | 5 +- ...ant_load_not_executed_but_in_parameters.ll | 22 +++----- .../invariant_verify_function_failed.ll | 10 ++-- .../invariant_verify_function_failed_2.ll | 4 +- .../loop-invariant-load-type-mismatch.ll | 4 +- .../multiple-types-invariant-load-2.ll | 4 +- .../test/CodeGen/out-of-scop-phi-node-use.ll | 4 +- polly/test/CodeGen/phi-defined-before-scop.ll | 4 +- polly/test/CodeGen/pr25241.ll | 5 +- .../CodeGen/scev_expansion_in_nonaffine.ll | 4 +- polly/test/DependenceInfo/fine_grain_dep_0.ll | 4 +- polly/test/ForwardOpTree/atax.ll | 8 +-- polly/test/ForwardOpTree/jacobi-1d.ll | 5 +- .../runtime_context_with_error_blocks.ll | 7 +-- ...attern-matching-based-opts-after-delicm.ll | 4 +- .../prevectorization-without-tiling.ll | 4 +- .../ScheduleOptimizer/prevectorization.ll | 5 +- .../ScopDetect/error-block-always-executed.ll | 10 ++-- .../error-block-referenced-from-scop.ll | 10 ++-- .../ScopDetect/expand-region-correctly-2.ll | 4 +- polly/test/ScopDetect/intrinsics_1.ll | 50 +++++++++---------- polly/test/ScopDetect/intrinsics_2.ll | 13 ++--- polly/test/ScopDetect/intrinsics_3.ll | 19 +++---- polly/test/ScopDetect/report-scop-location.ll | 7 +-- .../ReportIrreducibleRegion.ll | 8 +-- .../ReportLoopBound-01.ll | 3 -- .../ReportLoopHasNoExit.ll | 9 +--- .../ReportNonAffineAccess-01.ll | 4 -- .../ReportUnprofitable.ll | 12 ++--- .../ReportVariantBasePtr-01.ll | 7 +-- .../test/ScopInfo/BoundChecks/single-loop.ll | 12 ++--- polly/test/ScopInfo/BoundChecks/two-loops.ll | 12 ++--- polly/test/ScopInfo/complex-expression.ll | 4 +- .../do-not-model-error-block-accesses.ll | 10 ++-- .../early_exit_for_complex_domains.ll | 4 +- .../ScopInfo/expensive-boundary-context.ll | 4 +- polly/test/ScopInfo/intrinsics.ll | 7 +-- .../long-sequence-of-error-blocks-2.ll | 19 +++---- .../ScopInfo/long-sequence-of-error-blocks.ll | 15 ++---- polly/test/ScopInfo/memcpy-raw-source.ll | 7 +-- .../ScopInfo/mismatching-array-dimensions.ll | 4 +- polly/test/ScopInfo/multidim_srem.ll | 11 ++-- polly/test/ScopInfo/remarks.ll | 17 +++---- .../scev-div-with-evaluatable-divisor.ll | 4 +- polly/test/ScopInfo/unnamed_stmts.ll | 4 +- polly/test/Simplify/phi_in_regionstmt.ll | 5 +- 64 files changed, 220 insertions(+), 442 deletions(-) diff --git a/polly/docs/experiments/matmul/matmul.ll b/polly/docs/experiments/matmul/matmul.ll index b5bc4b0b7f690..6c789edc19ea1 100644 --- a/polly/docs/experiments/matmul/matmul.ll +++ b/polly/docs/experiments/matmul/matmul.ll @@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu" @.str.1 = private 
unnamed_addr constant [2 x i8] c"\0A\00", align 1 ; Function Attrs: noinline nounwind uwtable -define dso_local void @init_array() #0 { +define dso_local void @init_array() { entry: %i = alloca i32, align 4 %j = alloca i32, align 4 @@ -88,7 +88,7 @@ for.end19: ; preds = %for.cond } ; Function Attrs: noinline nounwind uwtable -define dso_local void @print_array() #0 { +define dso_local void @print_array() { entry: %i = alloca i32, align 4 %j = alloca i32, align 4 @@ -154,10 +154,10 @@ for.end12: ; preds = %for.cond ret void } -declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1 +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) ; Function Attrs: noinline nounwind uwtable -define dso_local i32 @main() #0 { +define dso_local i32 @main() { entry: %retval = alloca i32, align 4 %i = alloca i32, align 4 @@ -261,9 +261,6 @@ for.end30: ; preds = %for.cond ret i32 0 } -attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/docs/experiments/matmul/matmul.normalopt.ll b/polly/docs/experiments/matmul/matmul.normalopt.ll index 1625dc7511a2b..a2c28b89da500 100644 --- a/polly/docs/experiments/matmul/matmul.normalopt.ll +++ b/polly/docs/experiments/matmul/matmul.normalopt.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu" @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16 ; Function Attrs: noinline norecurse nounwind uwtable writeonly -define dso_local void @init_array() local_unnamed_addr #0 { +define dso_local void @init_array() local_unnamed_addr { entry: br label %for.cond1.preheader @@ -60,7 +60,7 @@ for.end19: ; preds = %for.inc17 } ; Function Attrs: noinline nounwind uwtable -define dso_local void @print_array() local_unnamed_addr #1 { +define dso_local void @print_array() local_unnamed_addr { entry: br label %for.cond1.preheader @@ -75,7 +75,7 @@ for.body3: ; preds = %for.inc, %for.cond1 %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv %2 = load float, float* %arrayidx5, align 4 %conv = fpext float %2 to double - %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4 + %call = tail call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) %3 = trunc i64 %indvars.iv to i32 %rem = urem i32 %3, 80 %cmp6 = icmp eq i32 %rem, 79 @@ -103,10 +103,10 @@ for.end12: ; preds = %for.end } ; Function Attrs: nounwind -declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2 +declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr ; Function Attrs: noinline norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #3 { +define dso_local i32 @main() local_unnamed_addr { entry: tail call void @init_array() br label %for.cond1.preheader @@ -164,13 +164,7 @@ for.end30: ; preds = %for.inc28 } ; Function Attrs: nounwind -declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4 - -attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll index 3216d79c2b448..3be4393f3051a 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll @@ -13,18 +13,18 @@ target triple = "x86_64-unknown-linux-gnu" @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16 ; Function Attrs: noinline nounwind uwtable -define dso_local void @init_array() local_unnamed_addr #0 { +define 
dso_local void @init_array() local_unnamed_addr { entry: %polly.par.userContext = alloca {}, align 8 %polly.par.userContext1 = bitcast {}* %polly.par.userContext to i8* - call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @init_array_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1) #3 - call void @init_array_polly_subfn(i8* nonnull %polly.par.userContext1) #3 - call void @GOMP_parallel_end() #3 + call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @init_array_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1) + call void @init_array_polly_subfn(i8* nonnull %polly.par.userContext1) + call void @GOMP_parallel_end() ret void } ; Function Attrs: noinline nounwind uwtable -define dso_local void @print_array() local_unnamed_addr #1 { +define dso_local void @print_array() local_unnamed_addr { entry: br label %for.cond1.preheader @@ -39,7 +39,7 @@ for.body3: ; preds = %for.inc, %for.cond1 %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv %2 = load float, float* %arrayidx5, align 4 %conv = fpext float %2 to double - %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #3 + %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) %3 = trunc i64 %indvars.iv to i32 %rem = urem i32 %3, 80 %cmp6 = icmp eq i32 %rem, 79 @@ -67,27 +67,27 @@ for.end12: ; preds = %for.end } ; Function Attrs: nounwind -declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2 +declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) 
local_unnamed_addr ; Function Attrs: noinline nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 { +define dso_local i32 @main() local_unnamed_addr { entry: %polly.par.userContext3 = alloca {}, align 8 tail call void @init_array() %polly.par.userContext1 = bitcast {}* %polly.par.userContext3 to i8* - call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1) #3 - call void @main_polly_subfn(i8* nonnull %polly.par.userContext1) #3 - call void @GOMP_parallel_end() #3 - call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn_1, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 64) #3 - call void @main_polly_subfn_1(i8* nonnull %polly.par.userContext1) #3 - call void @GOMP_parallel_end() #3 + call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1) + call void @main_polly_subfn(i8* nonnull %polly.par.userContext1) + call void @GOMP_parallel_end() + call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn_1, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 64) + call void @main_polly_subfn_1(i8* nonnull %polly.par.userContext1) + call void @GOMP_parallel_end() ret i32 0 } ; Function Attrs: nounwind -declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #3 +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr -define internal void @init_array_polly_subfn(i8* nocapture readnone %polly.par.userContext) #4 { +define internal void @init_array_polly_subfn(i8* nocapture readnone %polly.par.userContext) { polly.par.setup: %polly.par.LBPtr = alloca i64, align 8 %polly.par.UBPtr = alloca i64, align 8 @@ -146,7 +146,7 @@ declare void @GOMP_parallel_loop_runtime_start(void (i8*)*, i8*, i32, i64, i64, declare void @GOMP_parallel_end() local_unnamed_addr -define internal void @main_polly_subfn(i8* nocapture readnone %polly.par.userContext) #4 { +define internal void @main_polly_subfn(i8* nocapture readnone %polly.par.userContext) { polly.par.setup: %polly.par.LBPtr = alloca i64, align 8 %polly.par.UBPtr = alloca i64, align 8 @@ -175,7 +175,7 @@ polly.par.loadIVBounds: ; preds = %polly.par.setup, %p br i1 %7, label %polly.par.exit, label %polly.par.loadIVBounds } -define internal void @main_polly_subfn_1(i8* nocapture readnone %polly.par.userContext) #4 { +define internal void @main_polly_subfn_1(i8* nocapture readnone %polly.par.userContext) { polly.par.setup: %polly.par.LBPtr = alloca i64, align 8 %polly.par.UBPtr = alloca i64, align 8 @@ -363,14 +363,7 @@ vector.ph: ; preds = %polly.loop_header14 } ; Function Attrs: argmemonly nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5 - -attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } -attributes #4 = { "polly.skip.fn" } -attributes #5 = { argmemonly nounwind } +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll index 4afccd594245a..60228a455d56b 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu" @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16 ; Function Attrs: noinline norecurse nounwind uwtable writeonly -define dso_local void @init_array() local_unnamed_addr #0 { +define dso_local void @init_array() local_unnamed_addr { entry: br label %polly.loop_header @@ -61,7 +61,7 @@ polly.loop_header1: ; preds = %polly.loop_header1, } ; Function Attrs: noinline nounwind uwtable -define dso_local void @print_array() local_unnamed_addr #1 { +define dso_local void @print_array() local_unnamed_addr { entry: br label %for.cond1.preheader @@ -76,7 +76,7 @@ for.body3: ; preds = %for.inc, %for.cond1 %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv %2 = load float, float* %arrayidx5, align 4 %conv = fpext float %2 to double - %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4 + %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) %3 = trunc i64 %indvars.iv to i32 %rem = urem i32 %3, 80 %cmp6 = icmp eq i32 %rem, 79 @@ -104,10 +104,10 @@ for.end12: ; preds = %for.end } ; Function Attrs: nounwind -declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2 +declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) 
local_unnamed_addr ; Function Attrs: noinline norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #3 { +define dso_local i32 @main() local_unnamed_addr { entry: tail call void @init_array() call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([1536 x [1536 x float]]* @C to i8*), i8 0, i64 9437184, i1 false) @@ -282,17 +282,10 @@ vector.ph: ; preds = %polly.loop_header26 } ; Function Attrs: nounwind -declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4 +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr ; Function Attrs: argmemonly nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5 - -attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } -attributes #5 = { argmemonly nounwind } +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll index 0641b551ebecd..d2786e0272b02 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu" @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16 ; Function Attrs: noinline norecurse nounwind uwtable writeonly -define dso_local void @init_array() local_unnamed_addr #0 { +define dso_local void @init_array() local_unnamed_addr { entry: br label %polly.loop_header @@ -61,7 +61,7 @@ polly.loop_header1: ; preds = %polly.loop_header1, } ; 
Function Attrs: noinline nounwind uwtable -define dso_local void @print_array() local_unnamed_addr #1 { +define dso_local void @print_array() local_unnamed_addr { entry: br label %for.cond1.preheader @@ -76,7 +76,7 @@ for.body3: ; preds = %for.inc, %for.cond1 %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv %2 = load float, float* %arrayidx5, align 4 %conv = fpext float %2 to double - %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4 + %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) %3 = trunc i64 %indvars.iv to i32 %rem = urem i32 %3, 80 %cmp6 = icmp eq i32 %rem, 79 @@ -104,10 +104,10 @@ for.end12: ; preds = %for.end } ; Function Attrs: nounwind -declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2 +declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr ; Function Attrs: noinline norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #3 { +define dso_local i32 @main() local_unnamed_addr { entry: tail call void @init_array() call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([1536 x [1536 x float]]* @C to i8*), i8 0, i64 9437184, i1 false) @@ -343,17 +343,10 @@ vector.ph: ; preds = %polly.loop_header26 } ; Function Attrs: nounwind -declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4 +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr ; Function Attrs: argmemonly nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5 - -attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } -attributes #5 = { argmemonly nounwind } +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll index bfc74f2bd290f..5854cadff1c2f 100644 --- a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll +++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu" @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16 ; Function Attrs: noinline norecurse nounwind uwtable writeonly -define dso_local void @init_array() local_unnamed_addr #0 { +define dso_local void @init_array() local_unnamed_addr { entry: br label %polly.loop_header @@ -61,7 +61,7 @@ polly.loop_header1: ; preds = %polly.loop_header1, } ; Function Attrs: noinline nounwind uwtable -define dso_local void @print_array() local_unnamed_addr #1 { +define dso_local void @print_array() local_unnamed_addr { entry: br label %for.cond1.preheader @@ -76,7 +76,7 @@ for.body3: ; preds = %for.inc, %for.cond1 %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv %2 = load float, float* %arrayidx5, align 4 %conv = fpext float %2 to double - %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4 + %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) %3 = trunc i64 %indvars.iv to i32 %rem = urem i32 %3, 80 %cmp6 = icmp eq i32 %rem, 79 @@ -104,10 +104,10 @@ for.end12: ; preds = %for.end } ; Function Attrs: nounwind -declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2 +declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) 
local_unnamed_addr ; Function Attrs: noinline norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #3 { +define dso_local i32 @main() local_unnamed_addr { entry: tail call void @init_array() call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([1536 x [1536 x float]]* @C to i8*), i8 0, i64 9437184, i1 false) @@ -189,17 +189,10 @@ polly.loop_exit22: ; preds = %vector.body } ; Function Attrs: nounwind -declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4 +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr ; Function Attrs: argmemonly nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5 - -attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } -attributes #5 = { argmemonly nounwind } +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/docs/experiments/matmul/matmul.preopt.ll b/polly/docs/experiments/matmul/matmul.preopt.ll index 3b9446fcc9668..e55be8bad6f2e 100644 --- a/polly/docs/experiments/matmul/matmul.preopt.ll +++ b/polly/docs/experiments/matmul/matmul.preopt.ll @@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu" @.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 ; Function Attrs: noinline nounwind uwtable -define dso_local void @init_array() #0 { +define dso_local void @init_array() { entry: br label %entry.split @@ -52,7 +52,7 @@ for.end19: ; preds = %for.inc17 } ; Function Attrs: noinline nounwind uwtable -define dso_local void @print_array() #0 { +define dso_local void @print_array() { entry: br label %entry.split @@ -70,7 +70,7 @@ for.body3: ; 
preds = %for.cond1.preheader %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv %2 = load float, float* %arrayidx5, align 4 %conv = fpext float %2 to double - %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2 + %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) %3 = trunc i64 %indvars.iv to i32 %rem = urem i32 %3, 80 %cmp6 = icmp eq i32 %rem, 79 @@ -98,10 +98,10 @@ for.end12: ; preds = %for.end ret void } -declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1 +declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) ; Function Attrs: noinline nounwind uwtable -define dso_local i32 @main() #0 { +define dso_local i32 @main() { entry: br label %entry.split @@ -148,14 +148,10 @@ for.end30: ; preds = %for.inc28 } ; Function Attrs: nounwind -declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2 +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) ; Function Attrs: nounwind -declare i32 @fputc(i32, %struct._IO_FILE* nocapture) #2 - -attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll index ac78b4e6c0c55..4deab1af0ccf0 100644 --- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll +++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @main() local_unnamed_addr #0 { +define void @main() local_unnamed_addr { entry: %0 = load ptr, ptr undef, align 8, !tbaa !1 %1 = load ptr, ptr undef, align 8, !tbaa !1 @@ -35,8 +35,6 @@ kernel_gemver_StrictFP.exit: ; preds = %for.inc85.i238 ret void } -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 6.0.0 "} @@ -47,7 +45,6 @@ attributes #0 = { nounwind uwtable 
"correctly-rounded-divide-sqrt-fp-math"="fals !5 = !{!6, !6, i64 0} !6 = !{!"double", !3, i64 0} - ; CHECK-LABEL: define internal void @main_polly_subfn(ptr %polly.par.userContext) ; ; CHECK: polly.stmt.for.body65.i226: diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll index 96dc4250cd05f..c207f589e4da0 100644 --- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll +++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll @@ -23,7 +23,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @longLimit = external global [9 x [23 x i32]], align 16 @shortLimit = external global [9 x [14 x i32]], align 16 -define void @init_layer3(i32 %down_sample_sblimit) #0 { +define void @init_layer3(i32 %down_sample_sblimit) { entry: br label %for.cond.463.preheader @@ -63,8 +63,6 @@ for.inc.530: ; preds = %for.inc.527 br i1 %exitcond142, label %for.cond.499.preheader, label %for.cond.533.preheader } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 246359)"} diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll index 6ffe6bf67d54f..dfef4202391d4 100644 --- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll +++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll @@ -35,7 +35,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @global4 = external global ptr, align 8 ; Function Attrs: uwtable -define i32 @foo(ptr %arg) #0 personality ptr @blam { +define i32 @foo(ptr %arg) personality ptr @blam { bb: br label %bb3 @@ -83,40 +83,33 @@ bb19: ; preds = %bb19, %bb14 } ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, ptr nocapture) #1 +declare void @llvm.lifetime.start(i64, ptr nocapture) ; Function Attrs: nounwind readnone -declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #2 +declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) ; Function Attrs: nobuiltin -declare noalias ptr @eggs(i64) #3 +declare noalias ptr @eggs(i64) ; Function Attrs: nobuiltin -declare noalias ptr @bar(i64) #3 +declare noalias ptr @bar(i64) ; Function Attrs: uwtable -declare void @zot(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr #0 align 2 +declare void @zot(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr align 2 declare i32 @blam(...) 
; Function Attrs: nobuiltin nounwind -declare void @zot5(ptr) #4 +declare void @zot5(ptr) ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #1 +declare void @llvm.lifetime.end(i64, ptr nocapture) ; Function Attrs: uwtable -declare i32 @eggs6(ptr) #0 +declare i32 @eggs6(ptr) ; Function Attrs: nounwind uwtable -declare void @eggs7(ptr, i32, i32, i32) unnamed_addr #5 align 2 - -attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind readnone } -attributes #3 = { nobuiltin "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nobuiltin nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #5 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +declare void @eggs7(ptr, i32, i32, i32) unnamed_addr align 2 !llvm.ident = !{!0} diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll index 68c247a608311..fcc6764ce9c21 100644 --- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll +++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll @@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @REGISTER = external global [10 x i32], align 16 ; Function Attrs: nounwind uwtable -define void @FORMAT3_4() #0 { +define void @FORMAT3_4() { entry: %INSTR = alloca [32 x i32], align 16 br label %entry.split @@ -20,7 +20,7 @@ entry.split: ; preds = %entry br i1 %cmp, label %if.end.36, label %if.else if.else: ; preds = %entry.split - call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef) #2 + call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef) %1 = load i32, ptr undef, align 4 %cmp14 = icmp eq i32 %1, 1 br i1 %cmp14, label %land.lhs.true, label %if.end.36 @@ -179,8 +179,4 @@ return: ; preds = %if.then.219, %if.th ret void } -declare void @BYTES_TO_BITS(...) 
#1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } +declare void @BYTES_TO_BITS(...) diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll index 65fa6780d9720..ed4b81a8e3a3c 100644 --- a/polly/test/CodeGen/debug-intrinsics.ll +++ b/polly/test/CodeGen/debug-intrinsics.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @foo(ptr %A, i64 %N) #0 !dbg !4 { +define void @foo(ptr %A, i64 %N) !dbg !4 { entry: br label %entry.split @@ -49,13 +49,10 @@ for.end: ; preds = %for.cond.for.end_cr ; CHECK-NOT: #dbg_value ; Function Attrs: nounwind readnone -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 +declare void @llvm.dbg.declare(metadata, metadata, metadata) ; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } +declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!11, !12} diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll index 008e16caf9c23..abec28894f45b 100644 --- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll +++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll @@ -17,10 +17,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.timeb.11.32.53.242.347.557.599.851.998.1208.2069.2153.2174.2237.2258.2279.2321 = type { i64, i16, i16, i16 } %struct.pix_pos.13.34.55.244.349.559.601.853.1000.1210.2071.2155.2176.2239.2260.2281.2323 = type { i32, i32, i32, i32, i32, i32 } -declare void @getLuma4x4Neighbour() #0 +declare void @getLuma4x4Neighbour() ; Function Attrs: nounwind uwtable -define void @readCBP_CABAC(ptr %img) #1 { +define void @readCBP_CABAC(ptr %img) { entry: %block_a = alloca %struct.pix_pos.13.34.55.244.349.559.601.853.1000.1210.2071.2155.2176.2239.2260.2281.2323, align 4 %mb_data = getelementptr inbounds %struct.img_par.12.33.54.243.348.558.600.852.999.1209.2070.2154.2175.2238.2259.2280.2322, ptr 
%img, i64 0, i32 39 @@ -45,7 +45,7 @@ if.end.35: ; preds = %if.else.19, %if.els br i1 %cmp36, label %if.then.38, label %if.else.66 if.then.38: ; preds = %if.end.35 - call void @getLuma4x4Neighbour() #2 + call void @getLuma4x4Neighbour() %0 = load i32, ptr null, align 4 %tobool = icmp eq i32 %0, 0 br i1 %tobool, label %if.end.72, label %if.then.42 @@ -72,7 +72,3 @@ for.inc.84: ; preds = %if.end.72 for.end.86: ; preds = %for.inc.84 ret void } - -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll index 1f065bec80327..aa29bfd7dbcbc 100644 --- a/polly/test/CodeGen/hoisting_1.ll +++ b/polly/test/CodeGen/hoisting_1.ll @@ -26,7 +26,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.bar.11 = type { ptr, ptr, ptr } ; Function Attrs: nounwind uwtable -define void @foo(ptr %arg) #0 { +define void @foo(ptr %arg) { bb: br label %bb2 @@ -49,8 +49,6 @@ bb10: ; preds = %bb9 ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259869)"} diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll index e76ee066af08d..1b913f2cb07be 100644 --- a/polly/test/CodeGen/hoisting_2.ll +++ b/polly/test/CodeGen/hoisting_2.ll @@ -27,7 +27,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.foo.12 = type { %struct.foo.4, i32, i32, i32, i32 } ; Function Attrs: nounwind uwtable -define void @eggs(ptr %arg) #0 { +define void @eggs(ptr %arg) { bb: %tmp = load ptr, ptr undef, align 8, !tbaa !1 br label %bb5 @@ -62,8 +62,6 @@ bb22: ; preds = %bb13.bb22_crit_edge ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259869)"} diff --git 
a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll index 6dca218b63862..a708548798ebb 100644 --- a/polly/test/CodeGen/intrinsics_lifetime.ll +++ b/polly/test/CodeGen/intrinsics_lifetime.ll @@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @A = common global [1024 x i32] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define void @jd() #0 { +define void @jd() { entry: %tmp = alloca [1024 x i32], align 16 br label %for.cond @@ -32,7 +32,7 @@ for.cond: ; preds = %for.inc11, %entry br i1 %exitcond5, label %for.body, label %for.end13 for.body: ; preds = %for.cond - call void @llvm.lifetime.start(i64 4096, ptr %tmp) #1 + call void @llvm.lifetime.start(i64 4096, ptr %tmp) br label %for.cond2 for.cond2: ; preds = %for.inc, %for.body @@ -59,7 +59,7 @@ for.end: ; preds = %for.cond2 %tmp8 = load i32, ptr %arrayidx8, align 4 %arrayidx10 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %indvars.iv3 store i32 %tmp8, ptr %arrayidx10, align 4 - call void @llvm.lifetime.end(i64 4096, ptr %tmp) #1 + call void @llvm.lifetime.end(i64 4096, ptr %tmp) br label %for.inc11 for.inc11: ; preds = %for.end @@ -71,10 +71,7 @@ for.end13: ; preds = %for.cond } ; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, ptr nocapture) #1 +declare void @llvm.lifetime.start(i64, ptr nocapture) ; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } +declare void @llvm.lifetime.end(i64, ptr nocapture) diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll index 84164893ebf72..a643b8accd4e9 100644 --- a/polly/test/CodeGen/intrinsics_misc.ll +++ b/polly/test/CodeGen/intrinsics_misc.ll @@ -28,7 +28,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @A = common global [1024 x i32] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define void @jd() #0 { +define void @jd() { entry: %tmp = alloca [1024 x i32], align 16 br label %for.cond @@ -39,7 +39,7 @@ for.cond: ; preds = %for.inc11, %entry br i1 %exitcond5, label %for.body, label %for.end13 for.body: ; preds = %for.cond - %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A) #1 + %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A) br label %for.cond2 for.cond2: ; preds = %for.inc, %for.body @@ -68,7 +68,7 @@ for.end: ; preds = %for.cond2 %arrayidx8 = getelementptr inbounds [1024 x i32], ptr %tmp, i64 0, i64 %indvars.iv3 %tmp8 = load i32, ptr %arrayidx8, align 4 %arrayidx10 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %indvars.iv3 - call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A) #1 + call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A) store i32 %tmp8, ptr %arrayidx10, align 4 br label %for.inc11 @@ -81,19 +81,16 @@ for.end13: ; preds = %for.cond } ; Function Attrs: nounwind -declare void @llvm.donothing() #1 +declare void @llvm.donothing() ; Function Attrs: nounwind -declare void @llvm.assume(i1) #1 +declare void @llvm.assume(i1) ; Function Attrs: nounwind -declare i1 @llvm.expect.i1(i1, i1) #1 +declare i1 @llvm.expect.i1(i1, i1) ; Function Attrs: nounwind -declare ptr @llvm.invariant.start(i64, ptr nocapture) #1 +declare ptr @llvm.invariant.start(i64, ptr nocapture) ; Function Attrs: nounwind -declare void 
@llvm.invariant.end(ptr, i64, ptr nocapture) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } +declare void @llvm.invariant.end(ptr, i64, ptr nocapture) diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll index 0859a4e4997e1..420cb608f9ba4 100644 --- a/polly/test/CodeGen/invariant_cannot_handle_void.ll +++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll @@ -24,7 +24,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @sudecrypt(ptr %buff) #0 { +define void @sudecrypt(ptr %buff) { entry: br i1 undef, label %cleanup, label %if.end @@ -62,8 +62,6 @@ cleanup: ; preds = %entry ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"} diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll index 2995bce4c6601..0a88bb70966d2 100644 --- a/polly/test/CodeGen/invariant_load_different_sized_types.ll +++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll @@ -8,11 +8,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK-NEXT: %polly.access.tmp2.load = load i32, ptr %polly.access.tmp2, align 1 ; CHECK-NEXT: store i32 %polly.access.tmp2.load, ptr %tmp.preload.s2a - %struct.hoge = type { [4 x i8], i32, i32, i32, i32, i32, [16 x i8], [16 x i8], i64, i64, i64, i64, i64 } ; Function Attrs: nounwind uwtable -define void @widget() #0 { +define void @widget() { bb: %tmp2 = alloca %struct.hoge, align 1 br label %bb3 @@ -40,8 +39,6 @@ bb13: ; preds = %bb10 ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259771)"} diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll index 01b01761d908b..5a11adcdebbc5 100644 --- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll +++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll @@ -35,7 +35,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @donestkptr = external global ptr, align 8 ; Function Attrs: uwtable -define i32 @_Z13dotableswitchP9Classfile(ptr %c) #0 personality ptr @__gxx_personality_v0 { +define i32 @_Z13dotableswitchP9Classfile(ptr %c) personality ptr @__gxx_personality_v0 { entry: br label %entry.split @@ -82,33 +82,27 @@ for.end: ; preds = %for.cond.for.end_cr } ; Function Attrs: nounwind readnone -declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #1 +declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) ; Function Attrs: nobuiltin -declare noalias ptr @_Znam(i64) 
#2 +declare noalias ptr @_Znam(i64) ; Function Attrs: nobuiltin -declare noalias ptr @_Znwm(i64) #2 +declare noalias ptr @_Znwm(i64) ; Function Attrs: uwtable -declare void @_ZN3ExpC2Ejj7Exptype4Type2OpPS_jjP4Case(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr #0 align 2 +declare void @_ZN3ExpC2Ejj7Exptype4Type2OpPS_jjP4Case(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr align 2 declare i32 @__gxx_personality_v0(...) ; Function Attrs: nobuiltin nounwind -declare void @_ZdlPv(ptr) #3 +declare void @_ZdlPv(ptr) ; Function Attrs: uwtable -declare i32 @_Z10doluswitchP9Classfile(ptr) #0 +declare i32 @_Z10doluswitchP9Classfile(ptr) ; Function Attrs: nounwind uwtable -declare void @_ZN4Exp_C2E7Exptype4Type2Op(ptr, i32, i32, i32) unnamed_addr #4 align 2 - -attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { nobuiltin "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nobuiltin nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +declare void @_ZN4Exp_C2E7Exptype4Type2Op(ptr, i32, i32, i32) unnamed_addr align 2 !llvm.ident = !{!0} diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll index c9affac076e92..1dcc175ebb163 100644 --- a/polly/test/CodeGen/invariant_verify_function_failed.ll +++ b/polly/test/CodeGen/invariant_verify_function_failed.ll @@ -15,7 +15,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @fileblobSetFilename() #0 { +define void @fileblobSetFilename() { entry: br i1 undef, label %if.end, label %cleanup @@ -23,7 +23,7 @@ if.end: ; preds = %entry br i1 undef, label %land.lhs.true, label %if.end.18 land.lhs.true: ; preds = %if.end - %call9 = tail call ptr @__errno_location() #2 + %call9 = tail call ptr @__errno_location() %tmp = load i32, ptr %call9, align 4, !tbaa !1 br i1 false, label %if.then.12, label %if.end.18 @@ -45,11 +45,7 @@ cleanup: ; preds = %if.end.27, %entry } ; Function Attrs: nounwind readnone -declare ptr @__errno_location() #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone "disable-tail-calls"="false" 
"less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone } +declare ptr @__errno_location() !llvm.ident = !{!0} diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll index 7ef5608d7d197..43b3d99e11a2f 100644 --- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll +++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll @@ -39,7 +39,7 @@ target triple = "x86_64-unknown-linux-gnu" @enc_picture = external global ptr, align 8 ; Function Attrs: nounwind uwtable -define void @compute_colocated(ptr %listX, ptr %A, ptr %B) #0 { +define void @compute_colocated(ptr %listX, ptr %A, ptr %B) { entry: br label %for.body2414 @@ -82,8 +82,6 @@ if.end2624: ; preds = %for.inc2621 ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0"} diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll index d9065858ff250..f43247b3e5057 100644 --- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll +++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll @@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu" ; Just make sure this test passes correctly. -define void @kernel_ludcmp(ptr %b, ptr %y) #0 { +define void @kernel_ludcmp(ptr %b, ptr %y) { entry: br label %entry.split @@ -115,8 +115,6 @@ for.end.131: ; preds = %for.end.118 ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"} diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll index f6aca37c932bd..101fcaff0c82e 100644 --- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll +++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define void @hoge(ptr %arg) #0 { +define void @hoge(ptr %arg) { bb: br label %bb3 @@ -22,8 +22,6 @@ bb7: ; preds = %bb4, %bb3 ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259869)"} diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll index a4f942309ed28..dd0a24b14a3b8 100644 --- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll +++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll @@ -13,7 +13,7 @@ target triple 
= "x86_64-unknown-linux-gnu" ; CHECK-NEXT: %_s.sroa.343.0.ph5161118.ph.final_reload = load i32, ptr %_s.sroa.343.0.ph5161118.s2a ; Function Attrs: nounwind uwtable -define void @lzmaDecode() #0 { +define void @lzmaDecode() { entry: br label %for.cond.outer.outer.outer @@ -54,8 +54,6 @@ cleanup.1072: ; preds = %for.cond ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"} diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll index 23612061156d9..447a14e9999c2 100644 --- a/polly/test/CodeGen/phi-defined-before-scop.ll +++ b/polly/test/CodeGen/phi-defined-before-scop.ll @@ -14,7 +14,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @global = external global ptr, align 8 ; Function Attrs: nounwind uwtable -define void @wobble() #0 { +define void @wobble() { bb: br label %bb1 @@ -41,8 +41,6 @@ bb9: ; preds = %bb8 unreachable } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"} diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll index 4a4add8ba2a6d..7547b0bbed749 100644 --- a/polly/test/CodeGen/pr25241.ll +++ b/polly/test/CodeGen/pr25241.ll @@ -20,12 +20,11 @@ ; CHECK: %curr.3.ph.final_reload = load i32, ptr %curr.3.s2a ; CHECK: br label - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable -define void @BZ2_decompress() #0 { +define void @BZ2_decompress() { entry: %tmp = load i32, ptr undef, align 4, !tbaa !1 switch i32 undef, label %save_state_and_return [ @@ -56,8 +55,6 @@ save_state_and_return: ; preds = %entry ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"} diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll index 6c6c2572da109..0adb0ba7eea81 100644 --- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll +++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll @@ -31,7 +31,7 @@ target triple = "x86_64-unknown-linux-gnu" @reduce_sidechannel = external global i32, align 4 ; Function Attrs: nounwind uwtable -define void @iteration_init(ptr %gfp, ptr %l3_side, ptr %l3_enc) #0 { +define void @iteration_init(ptr %gfp, ptr %l3_side, ptr %l3_enc) { entry: %resvDrain = getelementptr inbounds %struct.III_side_info_t.7.62.139.227.293, ptr %l3_side, i64 0, i32 2 store i32 0, ptr %resvDrain, align 8 @@ -86,5 +86,3 @@ for.inc.117: ; preds = %for.inc.114, %for.c %cmp95 = icmp slt i64 %indvars.iv.next158, %6 br i1 
%cmp95, label %for.cond.98.preheader, label %for.cond.120.preheader } - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll index f93814c1c4be0..5abbf48136891 100644 --- a/polly/test/DependenceInfo/fine_grain_dep_0.ll +++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll @@ -31,7 +31,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable -define void @test(ptr %a, ptr %b, i64 %N) #0 { +define void @test(ptr %a, ptr %b, i64 %N) { entry: br label %for.cond @@ -67,8 +67,6 @@ for.end: ; preds = %for.cond ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0"} diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll index 496e8315b068b..6c81fb12e8cdc 100644 --- a/polly/test/ForwardOpTree/atax.ll +++ b/polly/test/ForwardOpTree/atax.ll @@ -2,7 +2,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define internal fastcc void @kernel_atax(ptr nocapture readonly %A, ptr nocapture readonly %x, ptr nocapture %y, ptr nocapture %tmp) unnamed_addr #0 { +define internal fastcc void @kernel_atax(ptr nocapture readonly %A, ptr nocapture readonly %x, ptr nocapture %y, ptr nocapture %tmp) unnamed_addr { entry: br label %entry.split @@ -61,10 +61,7 @@ for.end42: ; preds = %for.inc40 } ; Function Attrs: argmemonly nounwind -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i32, i1) #1 - -attributes #0 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i32, i1) !llvm.module.flags = !{!0} !llvm.ident = !{!1} @@ -78,7 +75,6 @@ attributes #1 = { argmemonly nounwind } !6 = !{!7, !7, i64 0} !7 = !{!"double", !4, i64 0} - ; CHECK: Statistics { ; CHECK: Operand trees forwarded: 2 ; CHECK: Statements with forwarded operand trees: 2 diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll index c9c71a15a4261..cb035bb749c7b 100644 --- a/polly/test/ForwardOpTree/jacobi-1d.ll +++ b/polly/test/ForwardOpTree/jacobi-1d.ll @@ -2,7 +2,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define internal fastcc void @kernel_jacobi_1d(ptr noalias nocapture %A, ptr noalias nocapture %B) unnamed_addr #0 { +define internal fastcc void @kernel_jacobi_1d(ptr noalias nocapture %A, ptr noalias nocapture %B) unnamed_addr { entry: br label %entry.split @@ -46,8 
+46,6 @@ for.end35: ; preds = %for.inc33 ret void } -attributes #0 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.module.flags = !{!0} !llvm.ident = !{!1} @@ -60,7 +58,6 @@ attributes #0 = { noinline norecurse nounwind uwtable "correctly-rounded-divide- !6 = !{!7, !7, i64 0} !7 = !{!"double", !4, i64 0} - ; CHECK: Statistics { ; CHECK: Operand trees forwarded: 2 ; CHECK: Statements with forwarded operand trees: 1 diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll index 8c3f230cb413c..aef509a865b6a 100644 --- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll +++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll @@ -16,7 +16,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" $_ZNKSt5ctypeIcE5widenEc = comdat any ; Function Attrs: uwtable -define weak_odr signext i8 @_ZNKSt5ctypeIcE5widenEc(ptr %this, i8 signext %__c) #0 comdat align 2 { +define weak_odr signext i8 @_ZNKSt5ctypeIcE5widenEc(ptr %this, i8 signext %__c) comdat align 2 { entry: br label %entry.split @@ -45,10 +45,7 @@ return: ; preds = %if.end, %if.then ret i8 %retval.0 } -declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(ptr) #1 - -attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(ptr) !llvm.ident = !{!0} diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll index 6e9ade869ec6c..a19b93d9915dd 100644 --- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll +++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll @@ -47,7 +47,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: norecurse nounwind uwtable -define void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, ptr nocapture %tmp, ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readnone %C, ptr nocapture readnone %D) local_unnamed_addr #0 { +define void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, ptr nocapture %tmp, ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readnone %C, ptr nocapture readnone %D) local_unnamed_addr { entry: br label %entry.split @@ -93,8 +93,6 @@ for.end27: ; preds = %for.inc25 ret void } -attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vl,-avx512vpopcntdq,-clflushopt,-clwb,-clzero,-fma4,-lwp,-mwaitx,-pku,-prefetchwt1,-prfchw,-rdseed,-rtm,-sgx,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll index a18ba1daef84b..1c6d289744e39 100644 --- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll +++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @B = common global [1536 x [1536 x float]] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define void @foo() #0 { +define void @foo() { entry: br label %entry.split @@ -51,8 +51,6 @@ for.end30: ; preds = %for.inc28 ret void } -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1) ; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1) diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll index 4db61ad032ea8..1ff20d165ce5e 100644 --- a/polly/test/ScheduleOptimizer/prevectorization.ll +++ b/polly/test/ScheduleOptimizer/prevectorization.ll @@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @B = common global [1536 x [1536 x float]] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define void @foo() #0 { +define void @foo() { entry: br label %entry.split @@ -53,8 +53,6 @@ for.end30: ; preds = %for.inc28 ret void } -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1) ; CHECK: for (int c1 = 0; c1 <= 47; c1 += 1) @@ -95,7 +93,6 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"= ; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5); ; VEC16: } - !llvm.ident = !{!0} !0 = !{!"clang version 3.5.0 "} diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll index d799d575a530b..20d02b1c1ae0b 100644 --- a/polly/test/ScopDetect/error-block-always-executed.ll +++ b/polly/test/ScopDetect/error-block-always-executed.ll @@ -8,14 +8,14 @@ target triple = "x86_64-unknown-linux-gnu" %struct.hoge = type { i32, i32, i32, i32 } ; Function Attrs: nounwind uwtable -define void @widget() #0 { +define void @widget() { bb13: %tmp1 = alloca %struct.hoge, align 4 br i1 undef, label %bb14, label %bb19 bb14: ; preds = %bb13 %tmp = load i32, ptr undef, 
align 4, !tbaa !1 - call void @quux() #2 + call void @quux() br i1 false, label %bb15, label %bb18 bb15: ; preds = %bb14 @@ -46,11 +46,7 @@ bb25: ; preds = %bb2 unreachable } -declare void @quux() #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } +declare void @quux() !llvm.ident = !{!0} diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll index ba271f34ea7b4..6c66f6df14af5 100644 --- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll +++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable -define void @hoge() #0 { +define void @hoge() { bb: br label %bb1 @@ -15,7 +15,7 @@ bb1: ; preds = %bb bb2: ; preds = %bb1 %tmp = load i32, ptr undef, align 8, !tbaa !1 - %tmp3 = tail call i32 @widget() #2 + %tmp3 = tail call i32 @widget() br i1 false, label %bb4, label %bb5 bb4: ; preds = %bb2 @@ -33,11 +33,7 @@ bb8: ; preds = %bb7, %bb5, %bb4 } ; Function Attrs: inlinehint nounwind readonly uwtable -declare i32 @widget() #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { inlinehint nounwind readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readonly } +declare i32 @widget() !llvm.ident = !{!0} diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll index df35d05674f95..a5c9626d28361 100644 --- a/polly/test/ScopDetect/expand-region-correctly-2.ll +++ b/polly/test/ScopDetect/expand-region-correctly-2.ll @@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable -define void @qtm_decompress() #0 { +define void @qtm_decompress() { entry: br label %if.end.1631 @@ -45,8 +45,6 @@ cleanup.1785: ; preds = %if.then.1659 ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"} diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll index 
0f9c70084a3da..58c9197f7f799 100644 --- a/polly/test/ScopDetect/intrinsics_1.ll +++ b/polly/test/ScopDetect/intrinsics_1.ll @@ -15,7 +15,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @jd(ptr noalias %A, ptr noalias %B) #0 { +define void @jd(ptr noalias %A, ptr noalias %B) { entry: br label %for.cond @@ -28,29 +28,29 @@ for.body: ; preds = %for.cond %tmp = trunc i64 %indvars.iv to i32 %conv = sitofp i32 %tmp to double %tmp1 = call double @llvm.sqrt.f64(double %conv) - %call = call double @__log10_finite(double %tmp1) #2 - %call1 = call double @ceil(double %call) #2 + %call = call double @__log10_finite(double %tmp1) + %call1 = call double @ceil(double %call) %tmp2 = trunc i64 %indvars.iv to i32 %conv2 = sitofp i32 %tmp2 to double - %call3 = call double @__log2_finite(double %conv2) #2 - %call4 = call double @floor(double %call3) #2 + %call3 = call double @__log2_finite(double %conv2) + %call4 = call double @floor(double %call3) %tmp3 = call double @llvm.pow.f64(double %call1, double %call4) %conv5 = fptosi double %tmp3 to i32 %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv store i32 %conv5, ptr %arrayidx, align 4 %tmp4 = trunc i64 %indvars.iv to i32 %conv6 = sitofp i32 %tmp4 to double - %call7 = call double @sin(double %conv6) #2 - %call8 = call double @__log_finite(double %call7) #2 + %call7 = call double @sin(double %conv6) + %call8 = call double @__log_finite(double %call7) %tmp5 = trunc i64 %indvars.iv to i32 %conv9 = sitofp i32 %tmp5 to double - %call10 = call double @cos(double %conv9) #2 - %call11 = call double @__exp2_finite(double %call10) #2 + %call10 = call double @cos(double %conv9) + %call11 = call double @__exp2_finite(double %call10) %add = fadd fast double %call8, %call11 - %call12 = call double @fabs(double %add) #2 + %call12 = call double @fabs(double %add) %tmp6 = trunc i64 %indvars.iv to i32 %conv13 = sitofp i32 %tmp6 to double - %call14 = call double @__exp_finite(double %conv13) #2 + %call14 = call double @__exp_finite(double %conv13) %add15 = fadd fast double %call12, %call14 %conv16 = fptrunc double %add15 to float %arrayidx18 = getelementptr inbounds float, ptr %B, i64 %indvars.iv @@ -66,41 +66,37 @@ for.end: ; preds = %for.cond } ; Function Attrs: nounwind readnone -declare double @ceil(double) #1 +declare double @ceil(double) ; Function Attrs: nounwind readnone -declare double @__log10_finite(double) #1 +declare double @__log10_finite(double) ; Function Attrs: nounwind readnone -declare double @llvm.sqrt.f64(double) #2 +declare double @llvm.sqrt.f64(double) ; Function Attrs: nounwind readnone -declare double @floor(double) #1 +declare double @floor(double) ; Function Attrs: nounwind readnone -declare double @__log2_finite(double) #1 +declare double @__log2_finite(double) ; Function Attrs: nounwind readnone -declare double @llvm.pow.f64(double, double) #2 +declare double @llvm.pow.f64(double, double) ; Function Attrs: nounwind readnone -declare double @fabs(double) #1 +declare double @fabs(double) ; Function Attrs: nounwind readnone -declare double @__log_finite(double) #1 +declare double @__log_finite(double) ; Function Attrs: nounwind readnone -declare double @sin(double) #1 +declare double @sin(double) ; Function Attrs: nounwind readnone -declare double @__exp2_finite(double) #1 +declare double @__exp2_finite(double) ; Function Attrs: nounwind readnone -declare double @cos(double) #1 +declare double @cos(double) ; Function Attrs: nounwind readnone -declare double 
@__exp_finite(double) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #2 = { nounwind readnone } +declare double @__exp_finite(double) diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll index 1db9807cadb8d..f71016e6d04cd 100644 --- a/polly/test/ScopDetect/intrinsics_2.ll +++ b/polly/test/ScopDetect/intrinsics_2.ll @@ -20,7 +20,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @A = common global [1024 x i32] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define void @jd() #0 { +define void @jd() { entry: %tmp = alloca [1024 x i32], align 16 br label %for.cond @@ -31,7 +31,7 @@ for.cond: ; preds = %for.inc11, %entry br i1 %exitcond5, label %for.body, label %for.end13 for.body: ; preds = %for.cond - call void @llvm.lifetime.start(i64 4096, ptr %tmp) #1 + call void @llvm.lifetime.start(i64 4096, ptr %tmp) br label %for.cond2 for.cond2: ; preds = %for.inc, %for.body @@ -62,7 +62,7 @@ for.end: ; preds = %for.cond2 for.inc11: ; preds = %for.end %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1 - call void @llvm.lifetime.end(i64 4096, ptr %tmp) #1 + call void @llvm.lifetime.end(i64 4096, ptr %tmp) br label %for.cond for.end13: ; preds = %for.cond @@ -70,10 +70,7 @@ for.end13: ; preds = %for.cond } ; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, ptr nocapture) #1 +declare void @llvm.lifetime.start(i64, ptr nocapture) ; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } +declare void @llvm.lifetime.end(i64, ptr nocapture) diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll index a230d0aa831c5..579d5bd481d44 100644 --- a/polly/test/ScopDetect/intrinsics_3.ll +++ b/polly/test/ScopDetect/intrinsics_3.ll @@ -20,7 +20,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @A = common global [1024 x i32] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define void @jd() #0 { +define void @jd() { entry: %tmp = alloca [1024 x i32], align 16 br label %for.cond @@ -31,7 +31,7 @@ for.cond: ; preds = %for.inc11, %entry br i1 %exitcond5, label %for.body, label %for.end13 for.body: ; preds = %for.cond - %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A) #1 + %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A) br label %for.cond2 for.cond2: ; preds = %for.inc, %for.body @@ -60,7 +60,7 @@ for.end: ; preds = %for.cond2 %arrayidx8 = getelementptr inbounds [1024 x i32], ptr %tmp, i64 0, i64 %indvars.iv3 %tmp8 = load i32, ptr %arrayidx8, align 4 %arrayidx10 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %indvars.iv3 - call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A) #1 + call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A) store i32 %tmp8, ptr %arrayidx10, align 4 br label %for.inc11 @@ -73,19 +73,16 @@ for.end13: ; preds = %for.cond } ; Function 
Attrs: nounwind -declare void @llvm.donothing() #1 +declare void @llvm.donothing() ; Function Attrs: nounwind -declare void @llvm.assume(i1) #1 +declare void @llvm.assume(i1) ; Function Attrs: nounwind -declare i1 @llvm.expect.i1(i1, i1) #1 +declare i1 @llvm.expect.i1(i1, i1) ; Function Attrs: nounwind -declare ptr @llvm.invariant.start(i64, ptr nocapture) #1 +declare ptr @llvm.invariant.start(i64, ptr nocapture) ; Function Attrs: nounwind -declare void @llvm.invariant.end(ptr, i64, ptr nocapture) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind } +declare void @llvm.invariant.end(ptr, i64, ptr nocapture) diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll index a99a2ef2b4841..5e4c38db5e53c 100644 --- a/polly/test/ScopDetect/report-scop-location.ll +++ b/polly/test/ScopDetect/report-scop-location.ll @@ -2,7 +2,7 @@ target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @foo(ptr %A) #0 !dbg !4 { +define void @foo(ptr %A) !dbg !4 { entry: br label %entry.split @@ -28,7 +28,7 @@ for.end: ; preds = %for.body ; CHECK: test.c:3: End of scop ; Function Attrs: nounwind uwtable -define void @bar(ptr %A) #0 !dbg !7 { +define void @bar(ptr %A) !dbg !7 { entry: br label %entry.split @@ -53,8 +53,6 @@ for.end: ; preds = %for.body ; CHECK: test.c:9: Start of scop ; CHECK: test.c:13: End of scop -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} !llvm.ident = !{!10} @@ -77,4 +75,3 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"= !16 = distinct !DILexicalBlock(line: 9, column: 0, file: !1, scope: !7) !17 = !DILocation(line: 13, scope: !16) !18 = !DILocation(line: 14, scope: !7) - diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll index d22c3b6d27c3d..30e5fb9fdeba8 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll @@ -19,9 +19,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK-NEXT: remark: ReportIrreducibleRegion.c:9:4: Irreducible region encountered in control flow. ; CHECK-NEXT: remark: ReportIrreducibleRegion.c:9:4: Invalid Scop candidate ends here. 
- ; Function Attrs: nounwind uwtable -define void @foo(i32 %a, i32 %b) #0 !dbg !4 { +define void @foo(i32 %a, i32 %b) !dbg !4 { entry: %a.addr = alloca i32, align 4 %b.addr = alloca i32, align 4 @@ -66,10 +65,7 @@ if.end6: ; preds = %if.end5, %entry } ; Function Attrs: nounwind readnone -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } +declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll index cb913000a9933..a96b64e4e0d54 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll @@ -79,9 +79,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, i64, metadata, metadata) -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } - !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!10, !11} !llvm.ident = !{!12} diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll index 3743bfae9fcaf..6156efaea1909 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll @@ -19,12 +19,10 @@ ; CHECK: remark: ReportLoopHasNoExit.c:7:7: Loop cannot be handled because it has no exit. - - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @func(i32 %param0, i32 %N, ptr %A) #0 !dbg !6 { +define void @func(i32 %param0, i32 %N, ptr %A) !dbg !6 { entry: %param0.addr = alloca i32, align 4 %N.addr = alloca i32, align 4 @@ -80,10 +78,7 @@ for.end: ; preds = %for.cond } ; Function Attrs: nounwind readnone -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } +declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll index 832045f089d64..13ac9d5ace2d3 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll @@ -5,7 +5,6 @@ ; A[i*i] = 0; ; } - ; CHECK: remark: ReportNonAffineAccess-01.c:2:7: The following errors keep this region from being a Scop. 
; CHECK: remark: ReportNonAffineAccess-01.c:3:5: The array subscript of "A" is not affine ; CHECK: remark: ReportNonAffineAccess-01.c:3:5: Invalid Scop candidate ends here. @@ -40,9 +39,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, i64, metadata, metadata) -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } - !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!10, !11} !llvm.ident = !{!12} diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll index b951487d61971..93e9e8b14038b 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll @@ -74,9 +74,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; YAML: Args: ; YAML: - String: Invalid Scop candidate ends here. - ; Function Attrs: nounwind uwtable -define void @onlyWrite(ptr %A) #0 !dbg !4 { +define void @onlyWrite(ptr %A) !dbg !4 { entry: call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !14, metadata !15), !dbg !16 call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !17, metadata !15), !dbg !20 @@ -102,10 +101,10 @@ for.end: ; preds = %for.cond } ; Function Attrs: nounwind readnone -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 +declare void @llvm.dbg.declare(metadata, metadata, metadata) ; Function Attrs: nounwind uwtable -define void @onlyRead(ptr %A) #0 !dbg !10 { +define void @onlyRead(ptr %A) !dbg !10 { entry: call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !29, metadata !15), !dbg !30 call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !31, metadata !15), !dbg !33 @@ -131,10 +130,7 @@ for.end: ; preds = %for.cond } ; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!11, !12} diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll index c2efd6165a26b..5f296fae9532b 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll @@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.b = type { ptr } -define void @a(ptr nocapture readonly %A) #0 !dbg !4 { +define void @a(ptr nocapture readonly %A) !dbg !4 { entry: br label %entry.split @@ -57,10 +57,7 @@ for.end: ; preds = %for.body ret void, !dbg !34 } -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!20, !21} 
diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll index 10a0a58f381d2..0b69beaaf3f9c 100644 --- a/polly/test/ScopInfo/BoundChecks/single-loop.ll +++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll @@ -38,7 +38,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @foo(i64 %n, ptr %A) #0 { +define void @foo(i64 %n, ptr %A) { entry: br label %for.cond @@ -51,7 +51,7 @@ for.body: ; preds = %for.cond br i1 false, label %if.then, label %if.end if.then: ; preds = %for.body - call void (...) @exception() #2 + call void (...) @exception() unreachable if.end: ; preds = %for.body @@ -59,7 +59,7 @@ if.end: ; preds = %for.body br i1 %cmp2, label %if.then.3, label %if.end.4 if.then.3: ; preds = %if.end - call void (...) @exception() #2 + call void (...) @exception() unreachable if.end.4: ; preds = %if.end @@ -79,11 +79,7 @@ for.end: ; preds = %for.cond } ; Function Attrs: noreturn -declare void @exception(...) #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { noreturn nounwind } +declare void @exception(...) !llvm.ident = !{!0} diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll index c85ac5b4ba8fd..f2ba17d33c0ea 100644 --- a/polly/test/ScopInfo/BoundChecks/two-loops.ll +++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll @@ -35,7 +35,7 @@ ; AST: { /* original code */ } target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define void @foo(i64 %n, ptr %A) #0 { +define void @foo(i64 %n, ptr %A) { entry: br label %for.cond @@ -56,7 +56,7 @@ for.body.3: ; preds = %for.cond.1 br i1 false, label %if.then, label %if.end if.then: ; preds = %for.body.3 - call void (...) @exception() #2 + call void (...) @exception() unreachable if.end: ; preds = %for.body.3 @@ -64,7 +64,7 @@ if.end: ; preds = %for.body.3 br i1 %cmp5, label %if.then.6, label %if.end.7 if.then.6: ; preds = %if.end - call void (...) @exception() #2 + call void (...) @exception() unreachable if.end.7: ; preds = %if.end @@ -90,11 +90,7 @@ for.end.10: ; preds = %for.cond ret void } -declare void @exception(...) #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { noreturn nounwind } +declare void @exception(...) 
!llvm.ident = !{!0} diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll index 6a6dde62d1ae5..4a2a1d2a64a6d 100644 --- a/polly/test/ScopInfo/complex-expression.ll +++ b/polly/test/ScopInfo/complex-expression.ll @@ -13,7 +13,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" ; Function Attrs: norecurse nounwind -define i32 @foo(ptr nocapture readonly %src1, ptr nocapture readonly %src2, ptr nocapture %score, ptr nocapture %max, i32 %n) #0 { +define i32 @foo(ptr nocapture readonly %src1, ptr nocapture readonly %src2, ptr nocapture %score, ptr nocapture %max, i32 %n) { entry: %cmp33 = icmp sgt i32 %n, 0 br i1 %cmp33, label %for.body.preheader, label %for.body7.preheader @@ -129,8 +129,6 @@ cleanup: ; preds = %for.body7.preheader ret i32 %retval.0 } -attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+strict-align" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll index baa423f407802..a3ca59563ab1f 100644 --- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll +++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @FORMAT3_4() #0 { +define void @FORMAT3_4() { entry: br label %entry.split @@ -14,7 +14,7 @@ entry.split: ; preds = %entry br i1 false, label %if.end.38, label %if.else if.else: ; preds = %entry.split - call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef) #2 + call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef) %0 = load i32, ptr null, align 4 br label %if.end.38 @@ -22,11 +22,7 @@ if.end.38: ; preds = %if.else, %entry.spl unreachable } -declare void @BYTES_TO_BITS(...) #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } +declare void @BYTES_TO_BITS(...) 
!llvm.ident = !{!0} diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll index eed19b3214a73..9a1edcbfb7796 100644 --- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll +++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll @@ -7,7 +7,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.regnode_charclass_class.2.42.654.690.726.870.978.1770.1806.1842.2166.2274.2382.2598.2814.3030.3064 = type { i8, i8, i16, i32, [32 x i8], [4 x i8] } ; Function Attrs: nounwind uwtable -define void @S_cl_or(ptr %cl, ptr %or_with) #0 { +define void @S_cl_or(ptr %cl, ptr %or_with) { entry: %0 = load i8, ptr %or_with, align 4, !tbaa !1 %conv = zext i8 %0 to i32 @@ -37,8 +37,6 @@ if.end91: ; preds = %for.body71, %for.bo ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0"} diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll index 1a8858d8fce20..95212f83acdca 100644 --- a/polly/test/ScopInfo/expensive-boundary-context.ll +++ b/polly/test/ScopInfo/expensive-boundary-context.ll @@ -16,7 +16,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.foo.1 = type { [3 x i32], [3 x i32], i32, i32, [4 x i32], [4 x i32] } ; Function Attrs: nounwind uwtable -define void @hoge() #0 { +define void @hoge() { bb: %tmp52 = alloca ptr, align 8 %tmp53 = alloca ptr, align 8 @@ -250,8 +250,6 @@ bb245: ; preds = %bb244, %bb232 unreachable } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} !0 = !{!"clang version 3.8.0 (trunk 252261) (llvm/trunk 252271)"} diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll index c5bbacbe6d8c2..e6d9e733e35bf 100644 --- a/polly/test/ScopInfo/intrinsics.ll +++ b/polly/test/ScopInfo/intrinsics.ll @@ -14,7 +14,7 @@ ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define void @fun() #0 { +define void @fun() { entry: %A = alloca [1024 x i32], align 16 br label %for.cond @@ -40,7 +40,4 @@ for.end: ; preds = %for.cond } ; Function Attrs: nounwind readnone -declare void @llvm.donothing() #1 - -attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } +declare void @llvm.donothing() diff --git 
a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll index 6027975b563b7..e32748a4bbb57 100644 --- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll +++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll @@ -13,13 +13,13 @@ target triple = "x86_64-unknown-linux-gnu" @global2 = external unnamed_addr constant [79 x i8], align 1 @global3 = external unnamed_addr constant [57 x i8], align 1 -declare void @widget() #0 +declare void @widget() ; Function Attrs: nounwind -declare void @quux(ptr, i64, ptr, ...) #1 +declare void @quux(ptr, i64, ptr, ...) ; Function Attrs: nounwind uwtable -define void @hoge(ptr %A) #2 { +define void @hoge(ptr %A) { bb: br label %bb15 @@ -39,7 +39,7 @@ bb19: ; preds = %bb15 br i1 %tmp22, label %bb24, label %bb23 bb23: ; preds = %bb19 - call void @widget() #3 + call void @widget() br label %bb24 bb24: ; preds = %bb23, %bb19, %bb15 @@ -57,7 +57,7 @@ bb29: ; preds = %bb24 br i1 %tmp32, label %bb33, label %bb34 bb33: ; preds = %bb29 - call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144) #3 + call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144) br label %bb34 bb34: ; preds = %bb33, %bb29, %bb24 @@ -84,7 +84,7 @@ bb43: ; preds = %bb39 br i1 %tmp47, label %bb48, label %bb49 bb48: ; preds = %bb43 - call void @widget() #3 + call void @widget() br label %bb49 bb49: ; preds = %bb48, %bb43, %bb39, %bb34 @@ -103,7 +103,7 @@ bb54: ; preds = %bb49 br i1 %tmp57, label %bb58, label %bb59 bb58: ; preds = %bb54 - call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global3) #3 + call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global3) br label %bb59 bb59: ; preds = %bb58, %bb54, %bb49 @@ -121,11 +121,6 @@ bb65: ; preds = %bb64, %bb59 ret void } -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 252261) (llvm/trunk 252271)"} diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll index 4ef5ef09c44b7..b32b87b5c3f3a 100644 --- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll +++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll @@ -51,13 +51,13 @@ target triple = "x86_64-unknown-linux-gnu" @global2 = external unnamed_addr constant [79 x i8], align 1 @global3 = external unnamed_addr constant [57 x i8], align 1 -declare void @widget() #0 +declare void @widget() ; Function Attrs: nounwind -declare void @quux(ptr, i64, ptr, ...) #1 +declare void @quux(ptr, i64, ptr, ...) 
; Function Attrs: nounwind uwtable -define void @hoge(ptr %A) #2 { +define void @hoge(ptr %A) { bb: br label %bb15 @@ -77,7 +77,7 @@ bb19: ; preds = %bb15 br i1 %tmp22, label %bb24, label %bb23 bb23: ; preds = %bb19 - call void @widget() #3 + call void @widget() br label %bb24 bb24: ; preds = %bb23, %bb19, %bb15 @@ -95,18 +95,13 @@ bb29: ; preds = %bb24 br i1 %tmp32, label %bb33, label %bb34 bb33: ; preds = %bb29 - call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144) #3 + call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144) br label %bb34 bb34: ; preds = %bb33, %bb29, %bb24 ret void } -attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } - !llvm.ident = !{!0} !0 = !{!"clang version 3.8.0 (trunk 252261) (llvm/trunk 252271)"} diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll index d9024cd27346f..6c45b0d41b76b 100644 --- a/polly/test/ScopInfo/memcpy-raw-source.ll +++ b/polly/test/ScopInfo/memcpy-raw-source.ll @@ -8,10 +8,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @tonemasks = external global [17 x [6 x [56 x float]]], align 16 ; Function Attrs: argmemonly nounwind -declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i32, i1) #0 +declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i32, i1) ; Function Attrs: nounwind uwtable -define void @setup_tone_curves() #1 { +define void @setup_tone_curves() { entry: %workc = alloca [17 x [8 x [56 x float]]], align 16 br label %for.cond7.preheader @@ -46,9 +46,6 @@ for.inc104: ; preds = %for.body74 ret void } -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 4.0.0 (trunk 285057) (llvm/trunk 285063)"} diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll index ed1e28cbee6ef..f825cbff1ec56 100644 --- a/polly/test/ScopInfo/mismatching-array-dimensions.ll +++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = 
"x86_64-apple-macosx10.9.0" ; Function Attrs: nounwind ssp uwtable -define void @hoge(ptr %arg, ptr %arg5, i32 %arg6) #0 { +define void @hoge(ptr %arg, ptr %arg5, i32 %arg6) { bb: br i1 undef, label %bb7, label %bb25 @@ -37,8 +37,6 @@ bb25: ; preds = %bb21, %bb ret void } -attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.module.flags = !{!0, !1, !2} !llvm.ident = !{!3} diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll index c965e2c86e2ba..88c8c6af648e0 100644 --- a/polly/test/ScopInfo/multidim_srem.ll +++ b/polly/test/ScopInfo/multidim_srem.ll @@ -19,11 +19,9 @@ ; CHECK-NEXT: [n] -> { Stmt_for_body_8[i0, i1, i2] -> MemRef_A[1, i1, i2] : (1 + i0) mod 2 = 0; Stmt_for_body_8[i0, i1, i2] -> MemRef_A[0, i1, i2] : (i0) mod 2 = 0 }; ; CHECK-NEXT: } - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(i64 %n, ptr %A) #0 { +define void @foo(i64 %n, ptr %A) { entry: br label %entry.split @@ -83,13 +81,10 @@ for.end.16: ; preds = %for.inc.14 } ; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, ptr nocapture) #1 +declare void @llvm.lifetime.start(i64, ptr nocapture) ; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } +declare void @llvm.lifetime.end(i64, ptr nocapture) !llvm.ident = !{!0} diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll index 2c173a31c46e9..10cc57aa27a14 100644 --- a/polly/test/ScopInfo/remarks.ll +++ b/polly/test/ScopInfo/remarks.ll @@ -41,7 +41,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @.str = private unnamed_addr constant [8 x i8] c"Printf!\00", align 1 -define void @valid(ptr %A, ptr %B, i32 %N, i32 %M, ptr %C, i32 %Debug) #0 !dbg !4 { +define void @valid(ptr %A, ptr %B, i32 %N, i32 %M, ptr %C, i32 %Debug) !dbg !4 { entry: call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !23, metadata !24), !dbg !25 call void @llvm.dbg.value(metadata ptr %B, i64 0, metadata !26, metadata !24), !dbg !27 @@ -105,7 +105,7 @@ for.end: ; preds = %for.cond.3 br i1 %tobool, label %if.end.18, label %if.then.17, !dbg !72 if.then.17: ; preds = %for.end - %call = call i32 (ptr, ...) @printf(ptr nonnull @.str) #3, !dbg !73 + %call = call i32 (ptr, ...) @printf(ptr nonnull @.str), !dbg !73 br label %if.end.18, !dbg !73 if.end.18: ; preds = %for.end, %if.then.17 @@ -120,11 +120,11 @@ for.end.21: ; preds = %for.cond ret void, !dbg !76 } -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 +declare void @llvm.dbg.declare(metadata, metadata, metadata) -declare i32 @printf(ptr, ...) #2 +declare i32 @printf(ptr, ...) 
-define void @invalid0(ptr %A) #0 !dbg !13 { +define void @invalid0(ptr %A) !dbg !13 { entry: call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !77, metadata !24), !dbg !78 call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !79, metadata !24), !dbg !81 @@ -173,12 +173,7 @@ for.end.7: ; preds = %for.cond ret void, !dbg !105 } -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 - -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!20, !21} diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll index 55192b5bc2c97..fa0c81fe9a48e 100644 --- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll +++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable -define void @cfft2(i32 %n, ptr %A) local_unnamed_addr #0 { +define void @cfft2(i32 %n, ptr %A) local_unnamed_addr { entry: br i1 true, label %for.body.lr.ph, label %for.end @@ -49,8 +49,6 @@ for.end: ; preds = %for.inc, %entry ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 3.9.0 (trunk 273249) (llvm/trunk 273255)"} diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll index 5a189454471f4..163170ce74895 100644 --- a/polly/test/ScopInfo/unnamed_stmts.ll +++ b/polly/test/ScopInfo/unnamed_stmts.ll @@ -48,7 +48,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable -define void @vec3(i64 %n, ptr, ptr) #0 { +define void @vec3(i64 %n, ptr, ptr) { br label %.split .split: ; preds = %0 @@ -141,8 +141,6 @@ define void @vec3(i64 %n, ptr, ptr) #0 { ret void } -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"Ubuntu clang version 3.7.1-3ubuntu4 (tags/RELEASE_371/final) (based on LLVM 3.7.1)"} diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll index 76efd484f547f..2bb05738955a3 100644 --- a/polly/test/Simplify/phi_in_regionstmt.ll +++ b/polly/test/Simplify/phi_in_regionstmt.ll @@ -14,7 +14,7 @@ target 
datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @qmatrix = external local_unnamed_addr global [8 x ptr], align 16 ; Function Attrs: nounwind uwtable -define void @AssignQuantParam(ptr %pps) local_unnamed_addr #0 { +define void @AssignQuantParam(ptr %pps) local_unnamed_addr { entry: br label %entry.split @@ -46,8 +46,6 @@ if.else121.us.7: ; preds = %if.else135.us.6, %i br label %if.end161 } -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - !llvm.ident = !{!0} !0 = !{!"clang version 6.0.0 (trunk 308961)"} @@ -59,5 +57,4 @@ attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="fals !6 = !{!"any pointer", !3, i64 0} !7 = !{!6, !6, i64 0} - ; CHECK: SCoP could not be simplified From ea45fec99c10b940938c8497a9b0d3d64388d44b Mon Sep 17 00:00:00 2001 From: Kunqiu Chen Date: Wed, 22 Oct 2025 22:17:39 +0800 Subject: [PATCH 106/530] [PredicateInfo] Reserve adjacent LN_Last defs for the same phi use (#164577) This patch fixes a missed optimization issue: predicate infos might be lost in phi-use scenarios. Due to the existence of and-chains, a phi-use might be associated with multiple LN_Last predicate infos. E.g., ```cpp // TWO LN_Last Predicate Info defs: // 1. a >= 1 // 2. a < 2 if ( a < 1 || a >= 2) { a = 1; } // PHI use of `a` use(a) ``` However, previously, `popStackUntilDFSScope` reserved only ONE LN_Last def for a phi use (i.e., reserve only one of `a >= 1` / `a < 2`), although there might be multiple LN_Last defs for the same phi use. This patch reserves the adjacent LN_Last defs if they are designated for the same phi use. --- llvm/lib/Transforms/Utils/PredicateInfo.cpp | 13 ++++++--- .../test/Transforms/SCCP/conditions-ranges.ll | 25 +++++++++++++++++ .../Util/PredicateInfo/testandor.ll | 27 +++++++++++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 978d5a25a57c8..371d9e6c83163 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -260,9 +260,16 @@ bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack, // next to the defs they must go with so that we can know it's time to pop // the stack when we hit the end of the phi uses for a given def. const ValueDFS &Top = *Stack.back().V; - if (Top.LocalNum == LN_Last && Top.PInfo) { - if (!VDUse.U) - return false; + assert(Top.PInfo && "RenameStack should only contain predicate infos (defs)"); + if (Top.LocalNum == LN_Last) { + if (!VDUse.U) { + assert(VDUse.PInfo && "A non-use VDUse should have a predicate info"); + // We should reserve adjacent LN_Last defs for the same phi use. + return VDUse.LocalNum == LN_Last && + // If the two phi defs have the same edge, they must be designated + // for the same succ BB. 
+ getBlockEdge(Top.PInfo) == getBlockEdge(VDUse.PInfo); + } auto *PHI = dyn_cast(VDUse.U->getUser()); if (!PHI) return false; diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll index a3cf23bd3ae9a..f793814e445f8 100644 --- a/llvm/test/Transforms/SCCP/conditions-ranges.ll +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -1547,3 +1547,28 @@ bb2: call void @use(i1 %c4) ret void } + +define i1 @and_predicate_dominating_phi(i32 %x) { +; CHECK-LABEL: @and_predicate_dominating_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1 +; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]] +; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]] +; CHECK: nope: +; CHECK-NEXT: br label [[PHI]] +; CHECK: phi: +; CHECK-NEXT: ret i1 true +; +entry: + %xge1 = icmp uge i32 %x, 1 + %xlt2 = icmp ult i32 %x, 2 + %and = and i1 %xge1, %xlt2 + br i1 %and, label %phi, label %nope +nope: + br label %phi +phi: + %res = phi i32 [ %x, %entry ], [ 1, %nope ] + %ret = icmp uge i32 %res, 1 + ret i1 %ret +} diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll index 2e96a92dd37d4..cc1dc4e6989a1 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll @@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) { call void @foo(i1 %a15) ret void } + +define i32 @test_and_with_phinode(i32 %x) { +; CHECK-LABEL: @test_and_with_phinode( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1 +; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]] +; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]] +; CHECK: nope: +; CHECK-NEXT: br label [[PHI]] +; CHECK: phi: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %xge1 = icmp uge i32 %x, 1 + %xlt2 = icmp ult i32 %x, 2 + %and = and i1 %xge1, %xlt2 + br i1 %and, label %phi, label %nope +nope: + br label %phi +phi: + %res = phi i32 [ %x, %entry ], [ 1, %nope ] + ret i32 %res +} From f7bbcdea8e9c0cb17e44c3920631eaad449f3229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 22 Oct 2025 17:19:12 +0300 Subject: [PATCH 107/530] [compiler-rt] Fix building for arm64ec (#164590) c208a23643231d0b19c6f795895a16dfe6797340 added the directive `.att_syntax` when building for x86 architectures. However, when building for arm64ec (a Windows target, for an ABI compatible with x86_64), the defines for `__x86_64__` (and similar ones like `__amd64__`) are still defined, so we need to check for `__arm64ec__` here as well to skip it for such targets. This matches similar existing ifdefs for x86_64/aarch64 in compiler-rt builtins. 
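For illustration only, the snippet below sketches the macro overlap this change works around; it is a hedged example (using `#pragma message` purely for demonstration) and is not part of the patch itself. On an arm64ec compile both branches fire, which is why the patch pairs the existing x86 check with `!defined(__arm64ec__)`.

```cpp
// Sketch: on an arm64ec build the x86-64 macros remain defined for ABI
// compatibility, so they cannot by themselves gate x86-only assembly paths.
#if defined(__x86_64__) || defined(__amd64__)
#pragma message("x86-64 macros are defined")
#endif
#if defined(__arm64ec__)
#pragma message("...but this is arm64ec, so x86-only code must exclude it")
#endif
```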
--- compiler-rt/lib/builtins/assembly.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h index d28f73f14b0cf..ac119af521a37 100644 --- a/compiler-rt/lib/builtins/assembly.h +++ b/compiler-rt/lib/builtins/assembly.h @@ -337,7 +337,8 @@ #endif #endif -#if defined(__ASSEMBLER__) && (defined(__i386__) || defined(__amd64__)) +#if defined(__ASSEMBLER__) && (defined(__i386__) || defined(__amd64__)) && \ + !defined(__arm64ec__) .att_syntax #endif From 45c0b29171633e3977938ded4223d9184af5c07b Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 22 Oct 2025 15:21:27 +0100 Subject: [PATCH 108/530] [LV] Ignore user-specified interleave count when unsafe. (#153009) When an VF is specified via a loop hint, it will be clamped to a safe VF or ignored if it is found to be unsafe. This is not the case for user-specified interleave counts, which can lead to loops such as the following with a memory dependence being vectorised with interleaving: ``` #pragma clang loop interleave_count(4) for (int i = 4; i < LEN; i++) b[i] = b[i - 4] + a[i]; ``` According to [1], loop hints are ignored if they are not safe to apply. This patch adds a check to prevent vectorisation with interleaving if isSafeForAnyVectorWidth() returns false. This is already checked in selectInterleaveCount(). [1] https://llvm.org/docs/LangRef.html#llvm-loop-vectorize-and-llvm-loop-interleave --- .../Transforms/Vectorize/LoopVectorize.cpp | 12 +++++++- .../AArch64/scalable-reductions.ll | 14 ++++----- .../LoopVectorize/unsafe-ic-hint-remark.ll | 30 +++++++++++++++++++ 3 files changed, 46 insertions(+), 10 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index adf27bed3d749..d2c100c9adbfd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9860,6 +9860,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth()) + UserIC = 1; // Plan how to best vectorize. LVP.plan(UserVF, UserIC); @@ -9924,7 +9926,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizeLoop = false; } - if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { + if (UserIC == 1 && Hints.getInterleave() > 1) { + assert(!LVL.isSafeForAnyVectorWidth() && + "UserIC should only be ignored due to unsafe dependencies"); + LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n"); + IntDiagMsg = {"InterleavingUnsafe", + "Ignoring user-specified interleave count due to possibly " + "unsafe dependencies in the loop."}; + InterleaveLoop = false; + } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { // Tell the user interleaving was avoided up-front, despite being explicitly // requested. 
LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll index 11cc971586773..fb7890a3b82f4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load <4 x i32> ; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll new file mode 100644 index 0000000000000..01934b1d7fbd2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll @@ -0,0 +1,30 @@ +; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; Make sure the unsafe user specified interleave count is ignored. + +; CHECK: remark: :0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-LABEL: @loop_distance_4 +define void @loop_distance_4(ptr %a, ptr %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ] + %0 = getelementptr i32, ptr %b, i64 %iv + %arrayidx = getelementptr i8, ptr %0, i64 -16 + %1 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %2 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %2, %1 + store i32 %add, ptr %0, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1 + +for.end: + ret void +} + +!1 = !{!1, !2, !3} +!2 = !{!"llvm.loop.interleave.count", i32 4} +!3 = !{!"llvm.loop.vectorize.width", i32 4} From 2dbe9592663a701546efd1ec1396417629542e4b Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Wed, 22 Oct 2025 10:24:39 -0400 Subject: [PATCH 109/530] Get the BoxType from the RHS instead of LHS for polymorphic pointer assignment inside FORALL. 
(#164279) Fixes #153220 --- .../flang/Optimizer/Builder/HLFIRTools.h | 7 ++ flang/lib/Lower/Bridge.cpp | 14 ++- flang/test/Lower/forall-polymorphic.f90 | 89 +++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 flang/test/Lower/forall-polymorphic.f90 diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index f96d2228915fe..9f7c10c2b06c2 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -98,6 +98,13 @@ class Entity : public mlir::Value { mlir::Type getElementOrSequenceType() const { return hlfir::getFortranElementOrSequenceType(getType()); } + /// Return the fir.class or fir.box type needed to describe this entity. + fir::BaseBoxType getBoxType() const { + if (isBoxAddressOrValue()) + return llvm::cast(fir::unwrapRefType(getType())); + const bool isVolatile = fir::isa_volatile_type(getType()); + return fir::BoxType::get(getElementOrSequenceType(), isVolatile); + } bool hasLengthParameters() const { mlir::Type eleTy = getFortranElementType(); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 238a623dc6c5d..acb8e114c167b 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4733,11 +4733,21 @@ class FirConverter : public Fortran::lower::AbstractConverter { return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, {}); hlfir::Entity rhs = Fortran::lower::convertExprToHLFIR( loc, *this, assign.rhs, localSymbols, rhsContext); + auto rhsBoxType = rhs.getBoxType(); // Create pointer descriptor value from the RHS. if (rhs.isMutableBox()) rhs = hlfir::Entity{fir::LoadOp::create(*builder, loc, rhs)}; - mlir::Value rhsBox = hlfir::genVariableBox( - loc, *builder, rhs, lhsBoxType.getBoxTypeWithNewShape(rhs.getRank())); + + // Use LHS type if LHS is not polymorphic. + fir::BaseBoxType targetBoxType; + if (assign.lhs.GetType()->IsPolymorphic()) + targetBoxType = rhsBoxType.getBoxTypeWithNewAttr( + fir::BaseBoxType::Attribute::Pointer); + else + targetBoxType = lhsBoxType.getBoxTypeWithNewShape(rhs.getRank()); + mlir::Value rhsBox = + hlfir::genVariableBox(loc, *builder, rhs, targetBoxType); + // Apply lower bounds or reshaping if any. if (const auto *lbExprs = std::get_if(&assign.u); diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-polymorphic.f90 new file mode 100644 index 0000000000000..2b7a51f9b549a --- /dev/null +++ b/flang/test/Lower/forall-polymorphic.f90 @@ -0,0 +1,89 @@ +! Test lower of FORALL polymorphic pointer assignment +! RUN: bbc -emit-fir %s -o - | FileCheck %s + +!! Test when LHS is polymorphic and RHS is not polymorphic +! CHECK-LABEL: c.func @_QPforallpolymorphic + subroutine forallPolymorphic() + TYPE :: DT + CLASS(DT), POINTER :: Ptr(:) => NULL() + END TYPE + + TYPE, EXTENDS(DT) :: DT1 + END TYPE + + TYPE(DT1), TARGET :: Tar1(10) + CLASS(DT), POINTER :: T(:) + integer :: I + + FORALL (I=1:10) + T(I)%Ptr => Tar1 + END FORALL + +! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.class>>>}>>>> {bindc_name = "t", uniq_name = "_QFforallpolymorphicEt"} +! CHECK: %[[V_15:[0-9]+]] = fir.declare %[[V_11]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFforallpolymorphicEt"} : (!fir.ref>>>}>>>>>) -> !fir.ref>>>}>>>>> +! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class>>>}>}>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphicEtar1"} +! 
CHECK: %[[V_17:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1> +! CHECK: %[[V_18:[0-9]+]] = fir.declare %[[V_16]](%[[V_17]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFforallpolymorphicEtar1"} : (!fir.ref>>>}>}>>>, !fir.shape<1>) -> !fir.ref>>>}>}>>> +! CHECK: %[[V_19:[0-9]+]] = fir.embox %[[V_18]](%[[V_17]]) : (!fir.ref>>>}>}>>>, !fir.shape<1>) -> !fir.box>>>}>}>>> +! CHECK: %[[V_34:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_35:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! CHECK: fir.do_loop %arg0 = %[[V_34]] to %[[V_35]] step %c1 +! CHECK: { +! CHECK: %[[V_36:[0-9]+]] = fir.convert %arg0 : (index) -> i32 +! CHECK: %[[V_37:[0-9]+]] = fir.load %[[V_15]] : !fir.ref>>>}>>>>> +! CHECK: %[[V_38:[0-9]+]] = fir.convert %[[V_36]] : (i32) -> i64 +! CHECK: %[[C0:.*]] = arith.constant 0 : index +! CHECK: %[[V_39:[0-9]+]]:3 = fir.box_dims %37, %[[C0]] : (!fir.class>>>}>>>>, index) -> (index, index, index) +! CHECK: %[[V_40:[0-9]+]] = fir.shift %[[V_39]]#0 : (index) -> !fir.shift<1> +! CHECK: %[[V_41:[0-9]+]] = fir.array_coor %[[V_37]](%[[V_40]]) %[[V_38]] : (!fir.class>>>}>>>>, !fir.shift<1>, i64) -> !fir.ref>>>}>> +! CHECK: %[[V_42:[0-9]+]] = fir.embox %[[V_41]] source_box %[[V_37]] : (!fir.ref>>>}>>, !fir.class>>>}>>>>) -> !fir.class>>>}>> +! CHECK: %[[V_43:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphicTdt{ptr:!fir.class>>>}> +! CHECK: %[[V_44:[0-9]+]] = fir.coordinate_of %[[V_42]], ptr : (!fir.class>>>}>>) -> !fir.ref>>>}>>>>> +! CHECK: %[[V_45:[0-9]+]] = fir.embox %[[V_18]](%[[V_17]]) : (!fir.ref>>>}>}>>>, !fir.shape<1>) -> !fir.box>>>}>}>>>> +! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.box>>>}>}>>>>) -> !fir.class>>>}>>>> +! CHECK: fir.store %[[V_46]] to %[[V_44]] : !fir.ref>>>}>>>>> +! CHECK: } + + end subroutine forallPolymorphic + +!! Test when LHS is not polymorphic but RHS is polymorphic +! CHECK-LABEL: c.func @_QPforallpolymorphic2( +! CHECK-SAME: %arg0: !fir.ref>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) { + subroutine forallPolymorphic2(Tar1) + TYPE :: DT + TYPE(DT), POINTER :: Ptr(:) => NULL() + END TYPE + + TYPE, EXTENDS(DT) :: DT1 + END TYPE + + CLASS(DT), ALLOCATABLE, TARGET :: Tar1(:) + TYPE(DT) :: T(10) + integer :: I + + FORALL (I=1:10) + T(I)%Ptr => Tar1 + END FORALL + +! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box>>>}>> {bindc_name = "t", uniq_name = "_QFforallpolymorphic2Et"} +! CHECK: %[[V_12:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1> +! CHECK: %[[V_13:[0-9]+]] = fir.declare %[[V_11]](%[[V_12]]) {uniq_name = "_QFforallpolymorphic2Et"} : (!fir.ref>>>}>>>, !fir.shape<1>) -> !fir.ref>>>}>>> +! CHECK: %[[V_18:[0-9]+]] = fir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFforallpolymorphic2Etar1"} : (!fir.ref>>>}>>>>>, !fir.dscope) -> !fir.ref>>>}>>>>> +! CHECK: %[[V_30:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_31:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! CHECK: fir.do_loop %arg1 = %[[V_30]] to %[[V_31]] step %c1 +! CHECK: { +! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg1 : (index) -> i32 +! CHECK: %[[V_33:[0-9]+]] = fir.convert %[[V_32]] : (i32) -> i64 +! CHECK: %[[V_34:[0-9]+]] = fir.array_coor %[[V_13]](%[[V_12]]) %[[V_33]] : (!fir.ref>>>}>>>, !fir.shape<1>, i64) -> !fir.ref>>>}>> +! CHECK: %[[V_35:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box>>>}> +! CHECK: %[[V_36:[0-9]+]] = fir.coordinate_of %[[V_34]], ptr : (!fir.ref>>>}>>) -> !fir.ref>>>}>>>>> +! 
CHECK: %[[V_37:[0-9]+]] = fir.load %[[V_18]] : !fir.ref>>>}>>>>> +! CHECK: %[[V_38:[0-9]+]]:3 = fir.box_dims %[[V_37]], %c0 : (!fir.class>>>}>>>>, index) -> (index, index, index) +! CHECK: %[[V_39:[0-9]+]] = fir.shift %[[V_38]]#0 : (index) -> !fir.shift<1> +! CHECK: %[[V_40:[0-9]+]] = fir.rebox %[[V_37]](%[[V_39]]) : (!fir.class>>>}>>>>, !fir.shift<1>) -> !fir.box>>>}>>>> +! CHECK: fir.store %[[V_40]] to %[[V_36]] : !fir.ref>>>}>>>>> +! CHECK: } + + end subroutine forallPolymorphic2 + From 7c826d4b4e394b6370714a03cf20fd6e8ddb4a12 Mon Sep 17 00:00:00 2001 From: Walter Lee <49250218+googlewalt@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:26:02 -0400 Subject: [PATCH 110/530] Revert "[lldb-dap] Use protocol types for exceptioninfo" (#164631) Reverts llvm/llvm-project#164318 2 failures from LLVM Buildbot. --- .../Handler/ExceptionInfoRequestHandler.cpp | 211 +++++++++++++----- lldb/tools/lldb-dap/Handler/RequestHandler.h | 10 +- .../lldb-dap/Protocol/ProtocolRequests.cpp | 18 -- .../lldb-dap/Protocol/ProtocolRequests.h | 22 -- .../tools/lldb-dap/Protocol/ProtocolTypes.cpp | 33 --- lldb/tools/lldb-dap/Protocol/ProtocolTypes.h | 30 --- lldb/unittests/DAP/CMakeLists.txt | 1 - lldb/unittests/DAP/ProtocolRequestsTest.cpp | 69 ------ lldb/unittests/DAP/ProtocolTypesTest.cpp | 47 ---- .../TestingSupport/TestUtilities.cpp | 5 - lldb/unittests/TestingSupport/TestUtilities.h | 4 - 11 files changed, 154 insertions(+), 296 deletions(-) delete mode 100644 lldb/unittests/DAP/ProtocolRequestsTest.cpp diff --git a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp index fa01a2036e1dd..c1c2adb32a510 100644 --- a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp @@ -7,77 +7,168 @@ //===----------------------------------------------------------------------===// #include "DAP.h" -#include "DAPError.h" -#include "Protocol/ProtocolRequests.h" -#include "Protocol/ProtocolTypes.h" +#include "EventHelper.h" +#include "JSONUtils.h" #include "RequestHandler.h" #include "lldb/API/SBStream.h" -using namespace lldb_dap::protocol; - namespace lldb_dap { -/// Retrieves the details of the exception that caused this event to be raised. -/// -/// Clients should only call this request if the corresponding capability -/// `supportsExceptionInfoRequest` is true. -llvm::Expected -ExceptionInfoRequestHandler::Run(const ExceptionInfoArguments &args) const { - - lldb::SBThread thread = dap.GetLLDBThread(args.threadId); - if (!thread.IsValid()) - return llvm::make_error( - llvm::formatv("Invalid thread id: {}", args.threadId).str()); - - ExceptionInfoResponseBody response; - response.breakMode = eExceptionBreakModeAlways; - const lldb::StopReason stop_reason = thread.GetStopReason(); - switch (stop_reason) { - case lldb::eStopReasonSignal: - response.exceptionId = "signal"; - break; - case lldb::eStopReasonBreakpoint: { - const ExceptionBreakpoint *exc_bp = - dap.GetExceptionBPFromStopReason(thread); - if (exc_bp) { - response.exceptionId = exc_bp->GetFilter(); - response.description = exc_bp->GetLabel(); +// "ExceptionInfoRequest": { +// "allOf": [ { "$ref": "#/definitions/Request" }, { +// "type": "object", +// "description": "Retrieves the details of the exception that +// caused this event to be raised. 
Clients should only call this request if +// the corresponding capability `supportsExceptionInfoRequest` is true.", +// "properties": { +// "command": { +// "type": "string", +// "enum": [ "exceptionInfo" ] +// }, +// "arguments": { +// "$ref": "#/definitions/ExceptionInfoArguments" +// } +// }, +// "required": [ "command", "arguments" ] +// }] +// }, +// "ExceptionInfoArguments": { +// "type": "object", +// "description": "Arguments for `exceptionInfo` request.", +// "properties": { +// "threadId": { +// "type": "integer", +// "description": "Thread for which exception information should be +// retrieved." +// } +// }, +// "required": [ "threadId" ] +// }, +// "ExceptionInfoResponse": { +// "allOf": [ { "$ref": "#/definitions/Response" }, { +// "type": "object", +// "description": "Response to `exceptionInfo` request.", +// "properties": { +// "body": { +// "type": "object", +// "properties": { +// "exceptionId": { +// "type": "string", +// "description": "ID of the exception that was thrown." +// }, +// "description": { +// "type": "string", +// "description": "Descriptive text for the exception." +// }, +// "breakMode": { +// "$ref": "#/definitions/ExceptionBreakMode", +// "description": "Mode that caused the exception notification to +// be raised." +// }, +// "details": { +// "$ref": "#/definitions/ExceptionDetails", +// "description": "Detailed information about the exception." +// } +// }, +// "required": [ "exceptionId", "breakMode" ] +// } +// }, +// "required": [ "body" ] +// }] +// } +// "ExceptionDetails": { +// "type": "object", +// "description": "Detailed information about an exception that has +// occurred.", "properties": { +// "message": { +// "type": "string", +// "description": "Message contained in the exception." +// }, +// "typeName": { +// "type": "string", +// "description": "Short type name of the exception object." +// }, +// "fullTypeName": { +// "type": "string", +// "description": "Fully-qualified type name of the exception object." +// }, +// "evaluateName": { +// "type": "string", +// "description": "An expression that can be evaluated in the current +// scope to obtain the exception object." +// }, +// "stackTrace": { +// "type": "string", +// "description": "Stack trace at the time the exception was thrown." +// }, +// "innerException": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/ExceptionDetails" +// }, +// "description": "Details of the exception contained by this exception, +// if any." 
+// } +// } +// }, +void ExceptionInfoRequestHandler::operator()( + const llvm::json::Object &request) const { + llvm::json::Object response; + FillResponse(request, response); + const auto *arguments = request.getObject("arguments"); + llvm::json::Object body; + lldb::SBThread thread = dap.GetLLDBThread(*arguments); + if (thread.IsValid()) { + auto stopReason = thread.GetStopReason(); + if (stopReason == lldb::eStopReasonSignal) + body.try_emplace("exceptionId", "signal"); + else if (stopReason == lldb::eStopReasonBreakpoint) { + ExceptionBreakpoint *exc_bp = dap.GetExceptionBPFromStopReason(thread); + if (exc_bp) { + EmplaceSafeString(body, "exceptionId", exc_bp->GetFilter()); + EmplaceSafeString(body, "description", exc_bp->GetLabel()); + } else { + body.try_emplace("exceptionId", "exception"); + } } else { - response.exceptionId = "exception"; - } - } break; - default: - response.exceptionId = "exception"; - } - - if (response.description.empty()) { - const size_t buffer_size = thread.GetStopDescription(nullptr, 0); - if (buffer_size > 0) { - std::string &buffer = response.description; - buffer.resize(buffer_size); - thread.GetStopDescription(buffer.data(), buffer.size()); + body.try_emplace("exceptionId", "exception"); } - } - - if (lldb::SBValue exception = thread.GetCurrentException()) { - lldb::SBStream stream; - response.details = ExceptionDetails{}; - if (exception.GetDescription(stream)) { - response.details->message = stream.GetData(); + if (!ObjectContainsKey(body, "description")) { + char description[1024]; + if (thread.GetStopDescription(description, sizeof(description))) { + EmplaceSafeString(body, "description", description); + } } + body.try_emplace("breakMode", "always"); + auto exception = thread.GetCurrentException(); + if (exception.IsValid()) { + llvm::json::Object details; + lldb::SBStream stream; + if (exception.GetDescription(stream)) { + EmplaceSafeString(details, "message", stream.GetData()); + } - if (lldb::SBThread exception_backtrace = - thread.GetCurrentExceptionBacktrace()) { - stream.Clear(); - exception_backtrace.GetDescription(stream); - - for (uint32_t idx = 0; idx < exception_backtrace.GetNumFrames(); idx++) { - lldb::SBFrame frame = exception_backtrace.GetFrameAtIndex(idx); - frame.GetDescription(stream); + auto exceptionBacktrace = thread.GetCurrentExceptionBacktrace(); + if (exceptionBacktrace.IsValid()) { + lldb::SBStream stream; + exceptionBacktrace.GetDescription(stream); + for (uint32_t i = 0; i < exceptionBacktrace.GetNumFrames(); i++) { + lldb::SBFrame frame = exceptionBacktrace.GetFrameAtIndex(i); + frame.GetDescription(stream); + } + EmplaceSafeString(details, "stackTrace", stream.GetData()); } - response.details->stackTrace = stream.GetData(); + + body.try_emplace("details", std::move(details)); } + // auto excInfoCount = thread.GetStopReasonDataCount(); + // for (auto i=0; i> { +class ExceptionInfoRequestHandler : public LegacyRequestHandler { public: - using RequestHandler::RequestHandler; + using LegacyRequestHandler::LegacyRequestHandler; static llvm::StringLiteral GetCommand() { return "exceptionInfo"; } FeatureSet GetSupportedFeatures() const override { return {protocol::eAdapterFeatureExceptionInfoRequest}; } - llvm::Expected - Run(const protocol::ExceptionInfoArguments &args) const override; + void operator()(const llvm::json::Object &request) const override; }; class InitializeRequestHandler diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index 
e207aad2167d6..b9393356b4e01 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -625,22 +625,4 @@ llvm::json::Value toJSON(const ModuleSymbolsResponseBody &DGMSR) { return result; } -bool fromJSON(const json::Value &Params, ExceptionInfoArguments &Args, - json::Path Path) { - json::ObjectMapper O(Params, Path); - return O && O.map("threadId", Args.threadId); -} - -json::Value toJSON(const ExceptionInfoResponseBody &ERB) { - json::Object result{{"exceptionId", ERB.exceptionId}, - {"breakMode", ERB.breakMode}}; - - if (!ERB.description.empty()) - result.insert({"description", ERB.description.c_str()}); - if (ERB.details.has_value()) - result.insert({"details", *ERB.details}); - - return result; -} - } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index 53e551ac2ec64..a85a68b87014c 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -1039,28 +1039,6 @@ struct ModuleSymbolsResponseBody { }; llvm::json::Value toJSON(const ModuleSymbolsResponseBody &); -struct ExceptionInfoArguments { - /// Thread for which exception information should be retrieved. - lldb::tid_t threadId = LLDB_INVALID_THREAD_ID; -}; -bool fromJSON(const llvm::json::Value &, ExceptionInfoArguments &, - llvm::json::Path); - -struct ExceptionInfoResponseBody { - /// ID of the exception that was thrown. - std::string exceptionId; - - /// Descriptive text for the exception. - std::string description; - - /// Mode that caused the exception notification to be raised. - ExceptionBreakMode breakMode; - - /// Detailed information about the exception. - std::optional details; -}; -llvm::json::Value toJSON(const ExceptionInfoResponseBody &); - } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index 95007013742a0..dc8edaadcd9bb 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -1136,37 +1136,4 @@ bool fromJSON(const json::Value &Param, Variable &V, json::Path Path) { Path, /*required=*/false); } -json::Value toJSON(const ExceptionBreakMode Mode) { - switch (Mode) { - case eExceptionBreakModeNever: - return "never"; - case eExceptionBreakModeAlways: - return "always"; - case eExceptionBreakModeUnhandled: - return "unhandled"; - case eExceptionBreakModeUserUnhandled: - return "userUnhandled"; - } - llvm_unreachable("unhandled exception breakMode."); -} - -json::Value toJSON(const ExceptionDetails &ED) { - json::Object result; - - if (!ED.message.empty()) - result.insert({"message", ED.message}); - if (!ED.typeName.empty()) - result.insert({"typeName", ED.typeName}); - if (!ED.fullTypeName.empty()) - result.insert({"fullTypeName", ED.fullTypeName}); - if (!ED.evaluateName.empty()) - result.insert({"evaluateName", ED.evaluateName}); - if (!ED.stackTrace.empty()) - result.insert({"stackTrace", ED.stackTrace}); - if (!ED.innerException.empty()) - result.insert({"innerException", ED.innerException}); - - return result; -} - } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 6d85c74377bd3..7077df90a85b5 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -1007,36 +1007,6 @@ struct Variable { 
llvm::json::Value toJSON(const Variable &); bool fromJSON(const llvm::json::Value &, Variable &, llvm::json::Path); -enum ExceptionBreakMode : unsigned { - eExceptionBreakModeNever, - eExceptionBreakModeAlways, - eExceptionBreakModeUnhandled, - eExceptionBreakModeUserUnhandled, -}; -llvm::json::Value toJSON(ExceptionBreakMode); - -struct ExceptionDetails { - /// Message contained in the exception. - std::string message; - - /// Short type name of the exception object. - std::string typeName; - - /// Fully-qualified type name of the exception object. - std::string fullTypeName; - - /// An expression that can be evaluated in the current scope to obtain the - /// exception object. - std::string evaluateName; - - /// Stack trace at the time the exception was thrown. - std::string stackTrace; - - /// Details of the exception contained by this exception, if any. - std::vector innerException; -}; -llvm::json::Value toJSON(const ExceptionDetails &); - } // namespace lldb_dap::protocol #endif diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index 434f5280a97a0..a08414c30e6cd 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -7,7 +7,6 @@ add_lldb_unittest(DAPTests Handler/ContinueTest.cpp JSONUtilsTest.cpp LLDBUtilsTest.cpp - ProtocolRequestsTest.cpp ProtocolTypesTest.cpp ProtocolUtilsTest.cpp TestBase.cpp diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp deleted file mode 100644 index 498195dc09325..0000000000000 --- a/lldb/unittests/DAP/ProtocolRequestsTest.cpp +++ /dev/null @@ -1,69 +0,0 @@ -//===-- ProtocolRequestsTest.cpp ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Protocol/ProtocolRequests.h" -#include "Protocol/ProtocolTypes.h" -#include "TestingSupport/TestUtilities.h" -#include "llvm/Testing/Support/Error.h" -#include - -using namespace llvm; -using namespace lldb_dap::protocol; -using lldb_private::PrettyPrint; -using llvm::json::parse; - -TEST(ProtocolRequestsTest, ExceptionInfoArguments) { - llvm::Expected expected = - parse(R"({ - "threadId": 3434 - })"); - ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); - EXPECT_EQ(expected->threadId, 3434U); - - // Check required keys; - EXPECT_THAT_EXPECTED(parse(R"({})"), - FailedWithMessage("missing value at (root).threadId")); - - EXPECT_THAT_EXPECTED(parse(R"({"id": 10})"), - FailedWithMessage("missing value at (root).threadId")); -} - -TEST(ProtocolRequestsTest, ExceptionInfoResponseBody) { - ExceptionInfoResponseBody body; - body.exceptionId = "signal"; - body.breakMode = eExceptionBreakModeAlways; - - // Check required keys. - Expected expected = parse( - R"({ - "exceptionId": "signal", - "breakMode": "always" - })"); - - ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); - EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body)); - - // Check optional keys. 
- body.description = "SIGNAL SIGWINCH"; - body.breakMode = eExceptionBreakModeNever; - body.details = ExceptionDetails{}; - body.details->message = "some message"; - - Expected expected_opt = parse( - R"({ - "exceptionId": "signal", - "description": "SIGNAL SIGWINCH", - "breakMode": "never", - "details": { - "message": "some message" - } - })"); - - ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); - EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); -} diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index 6a4620a3f1e59..8170abdd25bc6 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -1129,50 +1129,3 @@ TEST(ProtocolTypesTest, DataBreakpointInfoArguments) { EXPECT_THAT_EXPECTED(parse(R"({"name":"data"})"), llvm::Succeeded()); } - -TEST(ProtocolTypesTest, ExceptionBreakMode) { - const std::vector> test_cases = - {{ExceptionBreakMode::eExceptionBreakModeAlways, "always"}, - {ExceptionBreakMode::eExceptionBreakModeNever, "never"}, - {ExceptionBreakMode::eExceptionBreakModeUnhandled, "unhandled"}, - {ExceptionBreakMode::eExceptionBreakModeUserUnhandled, "userUnhandled"}}; - - for (const auto [value, expected] : test_cases) { - json::Value const serialized = toJSON(value); - ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); - EXPECT_EQ(serialized.getAsString(), expected); - } -} - -TEST(ProtocolTypesTest, ExceptionDetails) { - ExceptionDetails details; - - // Check required keys. - Expected expected = parse(R"({})"); - ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); - EXPECT_EQ(pp(*expected), pp(details)); - - // Check optional keys. - details.message = "SIGABRT exception"; - details.typeName = "signal"; - details.fullTypeName = "SIGABRT"; - details.evaluateName = "process handle SIGABRT"; - details.stackTrace = "some stacktrace"; - ExceptionDetails inner_details; - inner_details.message = "inner message"; - details.innerException = {std::move(inner_details)}; - - Expected expected_opt = parse(R"({ - "message": "SIGABRT exception", - "typeName": "signal", - "fullTypeName": "SIGABRT", - "evaluateName": "process handle SIGABRT", - "stackTrace": "some stacktrace", - "innerException": [{ - "message": "inner message" - }] - })"); - - ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); - EXPECT_EQ(pp(*expected_opt), pp(details)); -} diff --git a/lldb/unittests/TestingSupport/TestUtilities.cpp b/lldb/unittests/TestingSupport/TestUtilities.cpp index d164c227afb9e..b53822e38324b 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.cpp +++ b/lldb/unittests/TestingSupport/TestUtilities.cpp @@ -20,11 +20,6 @@ using namespace lldb_private; extern const char *TestMainArgv0; std::once_flag TestUtilities::g_debugger_initialize_flag; - -std::string lldb_private::PrettyPrint(const llvm::json::Value &value) { - return llvm::formatv("{0:2}", value).str(); -} - std::string lldb_private::GetInputFilePath(const llvm::Twine &name) { llvm::SmallString<128> result = llvm::sys::path::parent_path(TestMainArgv0); llvm::sys::fs::make_absolute(result); diff --git a/lldb/unittests/TestingSupport/TestUtilities.h b/lldb/unittests/TestingSupport/TestUtilities.h index f05d176618fa0..cc93a68a6a431 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.h +++ b/lldb/unittests/TestingSupport/TestUtilities.h @@ -30,10 +30,6 @@ } namespace lldb_private { - -/// Returns a pretty printed json string of a `llvm::json::Value`. 
-std::string PrettyPrint(const llvm::json::Value &E); - std::string GetInputFilePath(const llvm::Twine &name); class TestUtilities { From 7ae7a5ad51f32118161ee0aaa13b11368aa5d29b Mon Sep 17 00:00:00 2001 From: Michal R Date: Wed, 22 Oct 2025 16:30:33 +0200 Subject: [PATCH 111/530] [BPF] Do not emit names for PTR, CONST, VOLATILE and RESTRICT BTF types (#163174) We currently raise a warning in `print_btf.py` when any of these types have a name. Linux kernel doesn't allow names in these types either.[0] However, there is nothing stopping frontends from giving names to these types. To make sure that they are always anonymous, explicitly skip the name emission. [0] https://elixir.bootlin.com/linux/v6.17.1/source/kernel/bpf/btf.c#L2586 --- llvm/lib/Target/BPF/BTFDebug.cpp | 19 +++++- llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll | 59 +++++++++++++++++++ llvm/test/CodeGen/BPF/BTF/ptr-named.ll | 75 ++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll create mode 100644 llvm/test/CodeGen/BPF/BTF/ptr-named.ll diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index 9b5fc9d05e336..a652b7e9c537f 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -95,7 +95,24 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) { return; IsCompleted = true; - BTFType.NameOff = BDebug.addString(Name); + switch (Kind) { + case BTF::BTF_KIND_PTR: + case BTF::BTF_KIND_CONST: + case BTF::BTF_KIND_VOLATILE: + case BTF::BTF_KIND_RESTRICT: + // Debug info might contain names for these types, but given that we want + // to keep BTF minimal and naming reference types doesn't bring any value + // (what matters is the completeness of the base type), we don't emit them. + // + // Furthermore, the Linux kernel refuses to load BPF programs that contain + // BTF with these types named: + // https://elixir.bootlin.com/linux/v6.17.1/source/kernel/bpf/btf.c#L2586 + BTFType.NameOff = 0; + break; + default: + BTFType.NameOff = BDebug.addString(Name); + break; + } if (NeedsFixup || !DTy) return; diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll new file mode 100644 index 0000000000000..df0cbeb3dd625 --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; +; This IR is hand-written. 
+ +; ModuleID = 'ptr-named-2.ll' +source_filename = "ptr-named-2.ll" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpfel-unknown-none" + +%struct.TypeExamples = type { i32*, i32, i32, i32* } + +@type_examples = internal global %struct.TypeExamples zeroinitializer, align 8, !dbg !0 + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!2, !3, !4} +!llvm.ident = !{!21} + +; CHECK-BTF: [1] STRUCT 'TypeExamples' size=32 vlen=4 +; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0 +; CHECK-BTF-NEXT: 'volatile' type_id=4 bits_offset=64 +; CHECK-BTF-NEXT: 'const' type_id=5 bits_offset=128 +; CHECK-BTF-NEXT: 'restrict_ptr' type_id=6 bits_offset=192 +; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED +; CHECK-BTF-NEXT: [4] VOLATILE '(anon)' type_id=3 +; CHECK-BTF-NEXT: [5] CONST '(anon)' type_id=3 +; CHECK-BTF-NEXT: [6] RESTRICT '(anon)' type_id=7 +; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [8] VAR 'type_examples' type_id=1, linkage=static +; CHECK-BTF-NEXT: [9] DATASEC '.bss' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=8 offset=0 size=24 + +!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression()) +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, globals: !8, splitDebugInlining: false, nameTableKind: None) +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = distinct !DIGlobalVariable(name: "type_examples", scope: !1, file: !6, line: 12, type: !9, isLocal: true, isDefinition: true) +!6 = !DIFile(filename: "ptr-named-2.ll", directory: "/tmp") +!7 = !{} +!8 = !{!0} +!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeExamples", file: !6, line: 5, size: 256, elements: !10) +!10 = !{!11, !12, !13, !14} +!11 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !9, file: !6, line: 6, baseType: !15, size: 64) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "volatile", scope: !9, file: !6, line: 7, baseType: !17, size: 64, offset: 64) +!13 = !DIDerivedType(tag: DW_TAG_member, name: "const", scope: !9, file: !6, line: 8, baseType: !18, size: 64, offset: 128) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "restrict_ptr", scope: !9, file: !6, line: 9, baseType: !19, size: 64, offset: 192) +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*int", baseType: !16, size: 64) +!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!17 = !DIDerivedType(tag: DW_TAG_volatile_type, name: "volatile int", baseType: !16) +!18 = !DIDerivedType(tag: DW_TAG_const_type, name: "const int", baseType: !16) +!19 = !DIDerivedType(tag: DW_TAG_restrict_type, name: "*int restrict", baseType: !20) +!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64) +!21 = !{!"my hand-written IR"} diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll new file mode 100644 index 0000000000000..675c34e976abb --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; +; Source: +; #![no_std] +; 
#![no_main] +; +; pub struct MyType { +; ptr: *const u32, +; } +; +; impl MyType { +; pub const fn new() -> Self { +; let ptr = core::ptr::null(); +; Self { ptr } +; } +; } +; +; unsafe impl Sync for MyType {} +; +; #[unsafe(no_mangle)] +; pub static X: MyType = MyType::new(); +; +; #[cfg(not(test))] +; #[panic_handler] +; fn panic(_info: &core::panic::PanicInfo) -> ! { +; loop {} +; } +; Compilation flag: +; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc +; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o ptr-named.bc +; llvm-dis ptr-named.bc -o ptr-named.ll + +; ModuleID = 'ptr-named.bc' +source_filename = "1m2uqe50qkwxmo53ydydvou91" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpfel" + +@X = constant [8 x i8] zeroinitializer, align 8, !dbg !0 + +!llvm.module.flags = !{!11, !12, !13, !14} +!llvm.ident = !{!15} +!llvm.dbg.cu = !{!16} + +; CHECK-BTF: [1] STRUCT 'MyType' size=8 vlen=1 +; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0 +; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3 +; CHECK-BTF-NEXT: [3] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none) +; CHECK-BTF-NEXT: [4] VAR 'X' type_id=1, linkage=global +; CHECK-BTF-NEXT: [5] DATASEC '.rodata' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=4 offset=0 size=8 + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 19, type: !4, isLocal: false, isDefinition: true, align: 64) +!2 = !DINamespace(name: "ptr_named", scope: null) +!3 = !DIFile(filename: "ptr-named/src/main.rs", directory: "/tmp/ptr-named", checksumkind: CSK_MD5, checksum: "e37168304600b30cbb5ba168f0384932") +!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", scope: !2, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !6, templateParams: !10, identifier: "7609fa40332dd486922f074276a171c3") +!5 = !DIFile(filename: "", directory: "") +!6 = !{!7} +!7 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !4, file: !5, baseType: !8, size: 64, align: 64, flags: DIFlagPrivate) +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const u32", baseType: !9, size: 64, align: 64, dwarfAddressSpace: 0) +!9 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned) +!10 = !{} +!11 = !{i32 8, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{i32 7, !"Dwarf Version", i32 4} +!14 = !{i32 2, !"Debug Info Version", i32 3} +!15 = !{!"rustc version 1.92.0-nightly (c8905eaa6 2025-09-28)"} +!16 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !17, producer: "clang LLVM (rustc version 1.92.0-nightly (c8905eaa6 2025-09-28))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !18, splitDebugInlining: false, nameTableKind: None) +!17 = !DIFile(filename: "ptr-named/src/main.rs/@/1m2uqe50qkwxmo53ydydvou91", directory: "/tmp/ptr-named") +!18 = !{!0} From eb74d8e03cef0bb29bccf221f5ea565e47d7ab26 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 22 Oct 2025 07:30:43 -0700 Subject: [PATCH 112/530] [ThinLTO] Add index flag for internalization/promotion status (#164530) Add an index-wide flag indicating whether index-based internalization and promotion have completed. This will be used in a follow on change. 
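As a rough illustration of the intended use (the consumer function below is hypothetical; only the accessor names and the 0x400 bit encoding come from this patch), a later change could key off the new flag roughly like this:

```cpp
#include "llvm/IR/ModuleSummaryIndex.h"

// Hypothetical follow-on consumer: skip redundant summary-based
// internalization/promotion once the combined index records (via bit 0x400)
// that it has already been performed during the thin link.
static void maybeInternalizeAndPromote(llvm::ModuleSummaryIndex &Index) {
  if (Index.withInternalizeAndPromote())
    return; // already done; nothing to redo in this phase
  // ... run thinLTOInternalizeAndPromoteInIndex(...), which now ends by
  // calling Index.setWithInternalizeAndPromote().
}
```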
--- llvm/include/llvm/IR/ModuleSummaryIndex.h | 6 ++++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/IR/ModuleSummaryIndex.cpp | 8 ++++- llvm/lib/LTO/LTO.cpp | 2 ++ llvm/test/Bitcode/thinlto-deadstrip-flag.ll | 20 ----------- llvm/test/Bitcode/thinlto-index-flags.ll | 39 +++++++++++++++++++++ 6 files changed, 55 insertions(+), 22 deletions(-) delete mode 100644 llvm/test/Bitcode/thinlto-deadstrip-flag.ll create mode 100644 llvm/test/Bitcode/thinlto-index-flags.ll diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 0062cece17944..98df06ae43ad0 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1449,6 +1449,9 @@ class ModuleSummaryIndex { /// every summary of a GV is synchronized. bool WithDSOLocalPropagation = false; + /// Indicates that summary-based internalization and promotion has run. + bool WithInternalizeAndPromote = false; + /// Indicates that we have whole program visibility. bool WithWholeProgramVisibility = false; @@ -1653,6 +1656,9 @@ class ModuleSummaryIndex { bool withDSOLocalPropagation() const { return WithDSOLocalPropagation; } void setWithDSOLocalPropagation() { WithDSOLocalPropagation = true; } + bool withInternalizeAndPromote() const { return WithInternalizeAndPromote; } + void setWithInternalizeAndPromote() { WithInternalizeAndPromote = true; } + bool withWholeProgramVisibility() const { return WithWholeProgramVisibility; } void setWithWholeProgramVisibility() { WithWholeProgramVisibility = true; } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index cf7efbfab65c7..466dcb02696f4 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8603,7 +8603,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case bitc::FS_FLAGS: { // [flags] uint64_t Flags = Record[0]; // Scan flags. - assert(Flags <= 0x2ff && "Unexpected bits in flag"); + assert(Flags <= 0x7ff && "Unexpected bits in flag"); bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index a6353664e0e7a..62fd62caad3d6 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -111,11 +111,13 @@ uint64_t ModuleSummaryIndex::getFlags() const { Flags |= 0x100; if (hasUnifiedLTO()) Flags |= 0x200; + if (withInternalizeAndPromote()) + Flags |= 0x400; return Flags; } void ModuleSummaryIndex::setFlags(uint64_t Flags) { - assert(Flags <= 0x2ff && "Unexpected bits in flag"); + assert(Flags <= 0x7ff && "Unexpected bits in flag"); // 1 bit: WithGlobalValueDeadStripping flag. // Set on combined index only. if (Flags & 0x1) @@ -154,6 +156,10 @@ void ModuleSummaryIndex::setFlags(uint64_t Flags) { // Set on combined index only. if (Flags & 0x200) setUnifiedLTO(); + // 1 bit: WithInternalizeAndPromote flag. + // Set on combined index only. 
+ if (Flags & 0x400) + setWithInternalizeAndPromote(); } // Collect for the given module the list of function it defines diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index cbc0b1d3256bf..72ae064e46a0c 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -551,9 +551,11 @@ void llvm::thinLTOInternalizeAndPromoteInIndex( function_ref isExported, function_ref isPrevailing) { + assert(!Index.withInternalizeAndPromote()); for (auto &I : Index) thinLTOInternalizeAndPromoteGUID(Index.getValueInfo(I), isExported, isPrevailing); + Index.setWithInternalizeAndPromote(); } // Requires a destructor for std::vector. diff --git a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll b/llvm/test/Bitcode/thinlto-deadstrip-flag.ll deleted file mode 100644 index 00c01314a51c2..0000000000000 --- a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll +++ /dev/null @@ -1,20 +0,0 @@ -; REQUIRES: x86-registered-target -; RUN: opt -module-summary %s -o %t.o - -; Ensure dead stripping performed flag is set on distributed index -; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ -; RUN: -r %t.o,glob,plx -; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=WITHDEAD -; WITHDEAD: - -; Ensure dead stripping performed flag is not set on distributed index -; when option used to disable dead stripping computation. -; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ -; RUN: -r %t.o,glob,plx -compute-dead=false -; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD -; NODEAD: - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@glob = global i32 0 diff --git a/llvm/test/Bitcode/thinlto-index-flags.ll b/llvm/test/Bitcode/thinlto-index-flags.ll new file mode 100644 index 0000000000000..e957ce681038a --- /dev/null +++ b/llvm/test/Bitcode/thinlto-index-flags.ll @@ -0,0 +1,39 @@ +; REQUIRES: x86-registered-target +; RUN: opt -module-summary %s -o %t.o + +;; By default, the indexing step should perform and set the appropriate index +;; flags for dead stripping, attribute propagation, DSO local propagation, +;; and internalization/promotion. +; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ +; RUN: -r %t.o,glob,plx +; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=ALL +;; The flag value should be 0x461 aka 1121: +;; 0x1: Dead stripping +;; 0x20: Attribute propagation +;; 0x40: DSO local propagation +;; 0x400: Internalization/promotion +; ALL: + +;; Ensure dead stripping performed flag is not set on distributed index +;; when option used to disable dead stripping computation. +; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ +; RUN: -r %t.o,glob,plx -compute-dead=false +; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD +;; Flag should be 0x460 aka 1120. +; NODEAD: + +;; Disabling attribute propagation should disable that as well as DSO local +;; propagation. +; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ +; RUN: -r %t.o,glob,plx -propagate-attrs=false +; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOPROP +;; Flag should be 0x401 aka 1025. +; NOPROP: + +;; Note there isn't currently a way to disable internalization+promotion, which +;; are performed together. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@glob = global i32 0 From 276bccda66f333171f5ba08d18d9301ee663cf7a Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 22 Oct 2025 07:40:29 -0700 Subject: [PATCH 113/530] [lldb] Implement ProcessGDBRemote support for ReadMemoryRanges (#164311) This commit makes use of the newly created MultiMemRead packet to provide an efficient implementation of MultiMemRead inside ProcessGDBRemote. Testing is tricky, but it is accomplished two ways: 1. Some Objective-C tests would fail if this were implemented incorrectly, as there is already an in-tree use of the base class implementation of MultiMemRead, which is now getting replaced by the derived class. 2. One Objective-C test is modified so that we ensure the packet is being sent by looking at the packet logs. While not the most elegant solution, it is a strategy adopted in other tests as well. This gets around the fact that we cannot instantiate / unittest a mock ProcessGDBRemote. Depends on https://github.com/llvm/llvm-project/pull/163651 --- .../Process/gdb-remote/ProcessGDBRemote.cpp | 103 ++++++++++++++++++ .../Process/gdb-remote/ProcessGDBRemote.h | 16 +++ .../objc/foundation/TestObjCMethodsNSError.py | 27 +++++ 3 files changed, 146 insertions(+) diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 91f3a6ce383b1..b4422a7d58077 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -86,6 +86,7 @@ #include "lldb/Host/Host.h" #include "lldb/Utility/StringExtractorGDBRemote.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSwitch.h" @@ -2762,6 +2763,108 @@ size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size, return 0; } +llvm::SmallVector> +ProcessGDBRemote::ReadMemoryRanges( + llvm::ArrayRef> ranges, + llvm::MutableArrayRef buffer) { + if (!m_gdb_comm.GetMultiMemReadSupported()) + return Process::ReadMemoryRanges(ranges, buffer); + + llvm::Expected response = + SendMultiMemReadPacket(ranges); + if (!response) { + LLDB_LOG_ERROR(GetLog(GDBRLog::Process), response.takeError(), + "MultiMemRead error response: {0}"); + return Process::ReadMemoryRanges(ranges, buffer); + } + + llvm::StringRef response_str = response->GetStringRef(); + const unsigned expected_num_ranges = ranges.size(); + llvm::Expected>> + parsed_response = + ParseMultiMemReadPacket(response_str, buffer, expected_num_ranges); + if (!parsed_response) { + LLDB_LOG_ERROR(GetLog(GDBRLog::Process), parsed_response.takeError(), + "MultiMemRead error parsing response: {0}"); + return Process::ReadMemoryRanges(ranges, buffer); + } + return std::move(*parsed_response); +} + +llvm::Expected +ProcessGDBRemote::SendMultiMemReadPacket( + llvm::ArrayRef> ranges) { + std::string packet_str; + llvm::raw_string_ostream stream(packet_str); + stream << "MultiMemRead:ranges:"; + + auto range_to_stream = [&](auto range) { + // the "-" marker omits the '0x' prefix. 
+ stream << llvm::formatv("{0:x-},{1:x-}", range.base, range.size); + }; + llvm::interleave(ranges, stream, range_to_stream, ","); + stream << ";"; + + StringExtractorGDBRemote response; + GDBRemoteCommunication::PacketResult packet_result = + m_gdb_comm.SendPacketAndWaitForResponse(packet_str.data(), response, + GetInterruptTimeout()); + if (packet_result != GDBRemoteCommunication::PacketResult::Success) + return llvm::createStringError( + llvm::formatv("MultiMemRead failed to send packet: '{0}'", packet_str)); + + if (response.IsErrorResponse()) + return llvm::createStringError( + llvm::formatv("MultiMemRead failed: '{0}'", response.GetStringRef())); + + if (!response.IsNormalResponse()) + return llvm::createStringError(llvm::formatv( + "MultiMemRead unexpected response: '{0}'", response.GetStringRef())); + + return response; +} + +llvm::Expected>> +ProcessGDBRemote::ParseMultiMemReadPacket(llvm::StringRef response_str, + llvm::MutableArrayRef buffer, + unsigned expected_num_ranges) { + // The sizes and the data are separated by a `;`. + auto [sizes_str, memory_data] = response_str.split(';'); + if (sizes_str.size() == response_str.size()) + return llvm::createStringError(llvm::formatv( + "MultiMemRead response missing field separator ';' in: '{0}'", + response_str)); + + llvm::SmallVector> read_results; + + // Sizes are separated by a `,`. + for (llvm::StringRef size_str : llvm::split(sizes_str, ',')) { + uint64_t read_size; + if (size_str.getAsInteger(16, read_size)) + return llvm::createStringError(llvm::formatv( + "MultiMemRead response has invalid size string: {0}", size_str)); + + if (memory_data.size() < read_size) + return llvm::createStringError( + llvm::formatv("MultiMemRead response did not have enough data, " + "requested sizes: {0}", + sizes_str)); + + llvm::StringRef region_to_read = memory_data.take_front(read_size); + memory_data = memory_data.drop_front(read_size); + + assert(buffer.size() >= read_size); + llvm::MutableArrayRef region_to_write = + buffer.take_front(read_size); + buffer = buffer.drop_front(read_size); + + memcpy(region_to_write.data(), region_to_read.data(), read_size); + read_results.push_back(region_to_write); + } + + return read_results; +} + bool ProcessGDBRemote::SupportsMemoryTagging() { return m_gdb_comm.GetMemoryTaggingSupported(); } diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index 7c3dfb179a4b3..eb33b52b57441 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -137,6 +137,22 @@ class ProcessGDBRemote : public Process, size_t DoReadMemory(lldb::addr_t addr, void *buf, size_t size, Status &error) override; + /// Override of ReadMemoryRanges that uses MultiMemRead to optimize this + /// operation. 
+ llvm::SmallVector> + ReadMemoryRanges(llvm::ArrayRef> ranges, + llvm::MutableArrayRef buf) override; + +private: + llvm::Expected + SendMultiMemReadPacket(llvm::ArrayRef> ranges); + + llvm::Expected>> + ParseMultiMemReadPacket(llvm::StringRef response_str, + llvm::MutableArrayRef buffer, + unsigned expected_num_ranges); + +public: Status WriteObjectFile(std::vector entries) override; diff --git a/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py b/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py index a14035db5e057..a9fbe54e074ff 100644 --- a/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py +++ b/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py @@ -45,3 +45,30 @@ def test_NSError_p(self): ], ) self.runCmd("process continue") + + @skipIfOutOfTreeDebugserver + def test_runtime_types_efficient_memreads(self): + # Test that we use an efficient reading of memory when reading + # Objective-C method descriptions. + logfile = os.path.join(self.getBuildDir(), "log.txt") + self.runCmd(f"log enable -f {logfile} gdb-remote packets process") + self.addTearDownHook(lambda: self.runCmd("log disable gdb-remote packets")) + + self.build() + self.target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "// Break here for NSString tests", lldb.SBFileSpec("main.m", False) + ) + + self.runCmd(f"proc plugin packet send StartTesting", check=False) + self.expect('expression str = [NSString stringWithCString: "new"]') + self.runCmd(f"proc plugin packet send EndTesting", check=False) + + self.assertTrue(os.path.exists(logfile)) + log_text = open(logfile).read() + log_text = log_text.split("StartTesting", 1)[-1].split("EndTesting", 1)[0] + + # This test is only checking that the packet it used at all (and that + # no errors are produced). It doesn't check that the packet is being + # used to solve a problem in an optimal way. + self.assertIn("MultiMemRead:", log_text) + self.assertNotIn("MultiMemRead error", log_text) From 18d4ba593db9d6949300fbd97e900bfb86d651b2 Mon Sep 17 00:00:00 2001 From: Bertik23 <39457484+Bertik23@users.noreply.github.com> Date: Wed, 22 Oct 2025 16:46:12 +0200 Subject: [PATCH 114/530] [LLVM][IR] Add location tracking to LLVM IR parser (#155797) This PR is part of the LLVM IR LSP server project ([RFC](https://discourse.llvm.org/t/rfc-ir-visualization-with-vs-code-extension-using-an-lsp-server/87773)) To be able to make a LSP server, it's crucial to have location information about the LLVM objects (Functions, BasicBlocks and Instructions). This PR adds: * Position tracking to the Lexer * A new AsmParserContext class, to hold the new position info * Tests to check if the location is correct The AsmParserContext can be passed as an optional parameter into the parser. Which populates it and it can be then used by other tools, such as the LSP server. The AsmParserContext idea was borrowed from MLIR. As we didn't want to store data no one else uses inside the objects themselves. But the implementation is different, this class holds several maps of Functions, BasicBlocks and Instructions, to map them to their location. And some utility methods were added to get the positions of the processed tokens. 
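A minimal usage sketch (assumed, modeled on the unit test added below; the function name and the queried location are illustrative, not part of this patch):

    #include "llvm/AsmParser/AsmParserContext.h"
    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/SourceMgr.h"
    #include "llvm/Support/raw_ostream.h"
    #include <memory>

    // Parse textual IR while recording locations, then report which function
    // covers line 0, column 0 (FileLoc coordinates are zero-based).
    static void reportFunctionAtStart(llvm::StringRef Source) {
      llvm::LLVMContext Ctx;
      llvm::SMDiagnostic Err;
      llvm::AsmParserContext ParserCtx;
      std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(
          Source, Err, Ctx, /*Slots=*/nullptr, &ParserCtx);
      if (!M)
        return;
      if (llvm::Function *F =
              ParserCtx.getFunctionAtLocation(llvm::FileLoc{0, 0}))
        llvm::errs() << "function at 0:0: " << F->getName() << "\n";
    }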
--- .../include/llvm/AsmParser/AsmParserContext.h | 70 +++++++++++++++ llvm/include/llvm/AsmParser/FileLoc.h | 56 ++++++++++++ llvm/include/llvm/AsmParser/LLLexer.h | 24 +++-- llvm/include/llvm/AsmParser/LLParser.h | 11 ++- llvm/include/llvm/AsmParser/Parser.h | 16 ++-- llvm/include/llvm/IRReader/IRReader.h | 17 ++-- llvm/lib/AsmParser/AsmParserContext.cpp | 89 +++++++++++++++++++ llvm/lib/AsmParser/CMakeLists.txt | 1 + llvm/lib/AsmParser/LLLexer.cpp | 2 + llvm/lib/AsmParser/LLParser.cpp | 24 ++++- llvm/lib/AsmParser/Parser.cpp | 31 ++++--- llvm/lib/IRReader/IRReader.cpp | 13 ++- llvm/unittests/AsmParser/AsmParserTest.cpp | 55 ++++++++++++ 13 files changed, 367 insertions(+), 42 deletions(-) create mode 100644 llvm/include/llvm/AsmParser/AsmParserContext.h create mode 100644 llvm/include/llvm/AsmParser/FileLoc.h create mode 100644 llvm/lib/AsmParser/AsmParserContext.cpp diff --git a/llvm/include/llvm/AsmParser/AsmParserContext.h b/llvm/include/llvm/AsmParser/AsmParserContext.h new file mode 100644 index 0000000000000..1a397486cba4f --- /dev/null +++ b/llvm/include/llvm/AsmParser/AsmParserContext.h @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ASMPARSER_ASMPARSERCONTEXT_H +#define LLVM_ASMPARSER_ASMPARSERCONTEXT_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/AsmParser/FileLoc.h" +#include "llvm/IR/Value.h" +#include + +namespace llvm { + +/// Registry of file location information for LLVM IR constructs. +/// +/// This class provides access to the file location information +/// for various LLVM IR constructs. Currently, it supports Function, +/// BasicBlock and Instruction locations. +/// +/// When available, it can answer queries about what is at a given +/// file location, as well as where in a file a given IR construct +/// is. +/// +/// This information is optionally emitted by the LLParser while +/// it reads LLVM textual IR. +class AsmParserContext { + DenseMap Functions; + DenseMap Blocks; + DenseMap Instructions; + +public: + std::optional getFunctionLocation(const Function *) const; + std::optional getBlockLocation(const BasicBlock *) const; + std::optional getInstructionLocation(const Instruction *) const; + /// Get the function at the requested location range. + /// If no single function occupies the queried range, or the record is + /// missing, a nullptr is returned. + Function *getFunctionAtLocation(const FileLocRange &) const; + /// Get the function at the requested location. + /// If no function occupies the queried location, or the record is missing, a + /// nullptr is returned. + Function *getFunctionAtLocation(const FileLoc &) const; + /// Get the block at the requested location range. + /// If no single block occupies the queried range, or the record is missing, a + /// nullptr is returned. + BasicBlock *getBlockAtLocation(const FileLocRange &) const; + /// Get the block at the requested location. + /// If no block occupies the queried location, or the record is missing, a + /// nullptr is returned. + BasicBlock *getBlockAtLocation(const FileLoc &) const; + /// Get the instruction at the requested location range. 
+ /// If no single instruction occupies the queried range, or the record is + /// missing, a nullptr is returned. + Instruction *getInstructionAtLocation(const FileLocRange &) const; + /// Get the instruction at the requested location. + /// If no instruction occupies the queried location, or the record is missing, + /// a nullptr is returned. + Instruction *getInstructionAtLocation(const FileLoc &) const; + bool addFunctionLocation(Function *, const FileLocRange &); + bool addBlockLocation(BasicBlock *, const FileLocRange &); + bool addInstructionLocation(Instruction *, const FileLocRange &); +}; +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/AsmParser/FileLoc.h b/llvm/include/llvm/AsmParser/FileLoc.h new file mode 100644 index 0000000000000..02c1849fa986e --- /dev/null +++ b/llvm/include/llvm/AsmParser/FileLoc.h @@ -0,0 +1,56 @@ +//===-- FileLoc.h ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ASMPARSER_FILELOC_H +#define LLVM_ASMPARSER_FILELOC_H + +#include +#include + +namespace llvm { + +/// Struct holding Line:Column location +struct FileLoc { + /// 0-based line number + unsigned Line; + /// 0-based column number + unsigned Col; + + bool operator<=(const FileLoc &RHS) const { + return Line < RHS.Line || (Line == RHS.Line && Col <= RHS.Col); + } + + bool operator<(const FileLoc &RHS) const { + return Line < RHS.Line || (Line == RHS.Line && Col < RHS.Col); + } + + FileLoc(unsigned L, unsigned C) : Line(L), Col(C) {} + FileLoc(std::pair LC) : Line(LC.first), Col(LC.second) {} +}; + +/// Struct holding a semiopen range [Start; End) +struct FileLocRange { + FileLoc Start; + FileLoc End; + + FileLocRange() : Start(0, 0), End(0, 0) {} + + FileLocRange(FileLoc S, FileLoc E) : Start(S), End(E) { + assert(Start <= End); + } + + bool contains(FileLoc L) const { return Start <= L && L < End; } + + bool contains(FileLocRange LR) const { + return Start <= LR.Start && LR.End <= End; + } +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h index 501a7aefccd7f..0e379e5a75e65 100644 --- a/llvm/include/llvm/AsmParser/LLLexer.h +++ b/llvm/include/llvm/AsmParser/LLLexer.h @@ -13,22 +13,25 @@ #ifndef LLVM_ASMPARSER_LLLEXER_H #define LLVM_ASMPARSER_LLLEXER_H -#include "LLToken.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" +#include "llvm/AsmParser/LLToken.h" #include "llvm/Support/SMLoc.h" +#include "llvm/Support/SourceMgr.h" #include namespace llvm { class Type; class SMDiagnostic; - class SourceMgr; class LLVMContext; class LLLexer { const char *CurPtr; StringRef CurBuf; + /// The end (exclusive) of the previous token. + const char *PrevTokEnd = nullptr; + enum class ErrorPriority { None, // No error message present. Parser, // Errors issued by parser. 
@@ -62,9 +65,7 @@ namespace llvm { explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &, LLVMContext &C); - lltok::Kind Lex() { - return CurKind = LexToken(); - } + lltok::Kind Lex() { return CurKind = LexToken(); } typedef SMLoc LocTy; LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); } @@ -79,6 +80,19 @@ namespace llvm { IgnoreColonInIdentifiers = val; } + /// Get the line, column position of the start of the current token, + /// zero-indexed + std::pair getTokLineColumnPos() { + auto LC = SM.getLineAndColumn(SMLoc::getFromPointer(TokStart)); + return {LC.first - 1, LC.second - 1}; + } + /// Get the line, column position of the end of the previous token, + /// zero-indexed exclusive + std::pair getPrevTokEndLineColumnPos() { + auto LC = SM.getLineAndColumn(SMLoc::getFromPointer(PrevTokEnd)); + return {LC.first - 1, LC.second - 1}; + } + // This returns true as a convenience for the parser functions that return // true on error. bool ParseError(LocTy ErrorLoc, const Twine &Msg) { diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index c01de4a289a69..9eb31d7e0a451 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -13,8 +13,9 @@ #ifndef LLVM_ASMPARSER_LLPARSER_H #define LLVM_ASMPARSER_LLPARSER_H -#include "LLLexer.h" #include "llvm/ADT/StringMap.h" +#include "llvm/AsmParser/AsmParserContext.h" +#include "llvm/AsmParser/LLLexer.h" #include "llvm/AsmParser/NumberedValues.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Attributes.h" @@ -177,6 +178,9 @@ namespace llvm { // Map of module ID to path. std::map ModuleIdMap; + /// Keeps track of source locations for Values, BasicBlocks, and Functions. + AsmParserContext *ParserContext; + /// Only the llvm-as tool may set this to false to bypass /// UpgradeDebuginfo so it can generate broken bitcode. bool UpgradeDebugInfo; @@ -189,10 +193,11 @@ namespace llvm { public: LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M, ModuleSummaryIndex *Index, LLVMContext &Context, - SlotMapping *Slots = nullptr) + SlotMapping *Slots = nullptr, + AsmParserContext *ParserContext = nullptr) : Context(Context), OPLex(F, SM, Err, Context), Lex(F, SM, Err, Context), M(M), Index(Index), Slots(Slots), - BlockAddressPFS(nullptr) {} + BlockAddressPFS(nullptr), ParserContext(ParserContext) {} bool Run( bool UpgradeDebugInfo, DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) { diff --git a/llvm/include/llvm/AsmParser/Parser.h b/llvm/include/llvm/AsmParser/Parser.h index c900b79665404..22b0881d92b53 100644 --- a/llvm/include/llvm/AsmParser/Parser.h +++ b/llvm/include/llvm/AsmParser/Parser.h @@ -15,6 +15,7 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/AsmParser/AsmParserContext.h" #include "llvm/Support/Compiler.h" #include #include @@ -62,7 +63,8 @@ parseAssemblyFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context, /// parsing. LLVM_ABI std::unique_ptr parseAssemblyString(StringRef AsmString, SMDiagnostic &Err, - LLVMContext &Context, SlotMapping *Slots = nullptr); + LLVMContext &Context, SlotMapping *Slots = nullptr, + AsmParserContext *ParserContext = nullptr); /// Holds the Module and ModuleSummaryIndex returned by the interfaces /// that parse both. 
@@ -128,9 +130,9 @@ parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err); LLVM_ABI std::unique_ptr parseAssembly( MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context, SlotMapping *Slots = nullptr, - DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) { - return std::nullopt; - }); + DataLayoutCallbackTy DataLayoutCallback = + [](StringRef, StringRef) { return std::nullopt; }, + AsmParserContext *ParserContext = nullptr); /// Parse LLVM Assembly including the summary index from a MemoryBuffer. /// @@ -169,9 +171,9 @@ parseSummaryIndexAssembly(MemoryBufferRef F, SMDiagnostic &Err); LLVM_ABI bool parseAssemblyInto( MemoryBufferRef F, Module *M, ModuleSummaryIndex *Index, SMDiagnostic &Err, SlotMapping *Slots = nullptr, - DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) { - return std::nullopt; - }); + DataLayoutCallbackTy DataLayoutCallback = + [](StringRef, StringRef) { return std::nullopt; }, + AsmParserContext *ParserContext = nullptr); /// Parse a type and a constant value in the given string. /// diff --git a/llvm/include/llvm/IRReader/IRReader.h b/llvm/include/llvm/IRReader/IRReader.h index 790140f19934e..00cf12d342ae0 100644 --- a/llvm/include/llvm/IRReader/IRReader.h +++ b/llvm/include/llvm/IRReader/IRReader.h @@ -15,6 +15,7 @@ #define LLVM_IRREADER_IRREADER_H #include "llvm/ADT/StringRef.h" +#include "llvm/AsmParser/AsmParserContext.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Support/Compiler.h" #include @@ -50,19 +51,19 @@ getLazyIRFileModule(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context, /// for it. Otherwise, attempt to parse it as LLVM Assembly and return /// a Module for it. /// \param DataLayoutCallback Override datalayout in the llvm assembly. -LLVM_ABI std::unique_ptr parseIR(MemoryBufferRef Buffer, - SMDiagnostic &Err, - LLVMContext &Context, - ParserCallbacks Callbacks = {}); +LLVM_ABI std::unique_ptr +parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err, LLVMContext &Context, + ParserCallbacks Callbacks = {}, + AsmParserContext *ParserContext = nullptr); /// If the given file holds a bitcode image, return a Module for it. /// Otherwise, attempt to parse it as LLVM Assembly and return a Module /// for it. /// \param DataLayoutCallback Override datalayout in the llvm assembly. -LLVM_ABI std::unique_ptr parseIRFile(StringRef Filename, - SMDiagnostic &Err, - LLVMContext &Context, - ParserCallbacks Callbacks = {}); +LLVM_ABI std::unique_ptr +parseIRFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context, + ParserCallbacks Callbacks = {}, + AsmParserContext *ParserContext = nullptr); } #endif diff --git a/llvm/lib/AsmParser/AsmParserContext.cpp b/llvm/lib/AsmParser/AsmParserContext.cpp new file mode 100644 index 0000000000000..59d3ffcb470e4 --- /dev/null +++ b/llvm/lib/AsmParser/AsmParserContext.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/AsmParser/AsmParserContext.h" + +namespace llvm { + +std::optional +AsmParserContext::getFunctionLocation(const Function *F) const { + if (auto FIt = Functions.find(F); FIt != Functions.end()) + return FIt->second; + return std::nullopt; +} + +std::optional +AsmParserContext::getBlockLocation(const BasicBlock *BB) const { + if (auto BBIt = Blocks.find(BB); BBIt != Blocks.end()) + return BBIt->second; + return std::nullopt; +} + +std::optional +AsmParserContext::getInstructionLocation(const Instruction *I) const { + if (auto IIt = Instructions.find(I); IIt != Instructions.end()) + return IIt->second; + return std::nullopt; +} + +Function * +AsmParserContext::getFunctionAtLocation(const FileLocRange &Query) const { + for (auto &[F, Loc] : Functions) { + if (Loc.contains(Query)) + return F; + } + return nullptr; +} + +Function *AsmParserContext::getFunctionAtLocation(const FileLoc &Query) const { + return getFunctionAtLocation(FileLocRange(Query, Query)); +} + +BasicBlock * +AsmParserContext::getBlockAtLocation(const FileLocRange &Query) const { + for (auto &[BB, Loc] : Blocks) { + if (Loc.contains(Query)) + return BB; + } + return nullptr; +} + +BasicBlock *AsmParserContext::getBlockAtLocation(const FileLoc &Query) const { + return getBlockAtLocation(FileLocRange(Query, Query)); +} + +Instruction * +AsmParserContext::getInstructionAtLocation(const FileLocRange &Query) const { + for (auto &[I, Loc] : Instructions) { + if (Loc.contains(Query)) + return I; + } + return nullptr; +} + +Instruction * +AsmParserContext::getInstructionAtLocation(const FileLoc &Query) const { + return getInstructionAtLocation(FileLocRange(Query, Query)); +} + +bool AsmParserContext::addFunctionLocation(Function *F, + const FileLocRange &Loc) { + return Functions.insert({F, Loc}).second; +} + +bool AsmParserContext::addBlockLocation(BasicBlock *BB, + const FileLocRange &Loc) { + return Blocks.insert({BB, Loc}).second; +} + +bool AsmParserContext::addInstructionLocation(Instruction *I, + const FileLocRange &Loc) { + return Instructions.insert({I, Loc}).second; +} + +} // namespace llvm diff --git a/llvm/lib/AsmParser/CMakeLists.txt b/llvm/lib/AsmParser/CMakeLists.txt index 20d0c50a029ca..dcfcc06f093a7 100644 --- a/llvm/lib/AsmParser/CMakeLists.txt +++ b/llvm/lib/AsmParser/CMakeLists.txt @@ -1,5 +1,6 @@ # AsmParser add_llvm_component_library(LLVMAsmParser + AsmParserContext.cpp LLLexer.cpp LLParser.cpp Parser.cpp diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 50d1d4730007a..7a6c19ece92ac 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -191,6 +191,8 @@ int LLLexer::getNextChar() { } lltok::Kind LLLexer::LexToken() { + // Set token end to next location, since the end is exclusive. + PrevTokEnd = CurPtr; while (true) { TokStart = CurPtr; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index f71a534ef9c2e..5164cec33e6f5 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -752,14 +752,21 @@ bool LLParser::parseDeclare() { /// ::= 'define' FunctionHeader (!dbg !56)* '{' ... 
bool LLParser::parseDefine() { assert(Lex.getKind() == lltok::kw_define); + FileLoc FunctionStart(Lex.getTokLineColumnPos()); Lex.Lex(); Function *F; unsigned FunctionNumber = -1; SmallVector UnnamedArgNums; - return parseFunctionHeader(F, true, FunctionNumber, UnnamedArgNums) || - parseOptionalFunctionMetadata(*F) || - parseFunctionBody(*F, FunctionNumber, UnnamedArgNums); + bool RetValue = + parseFunctionHeader(F, true, FunctionNumber, UnnamedArgNums) || + parseOptionalFunctionMetadata(*F) || + parseFunctionBody(*F, FunctionNumber, UnnamedArgNums); + if (ParserContext) + ParserContext->addFunctionLocation( + F, FileLocRange(FunctionStart, Lex.getPrevTokEndLineColumnPos())); + + return RetValue; } /// parseGlobalType @@ -7018,6 +7025,8 @@ bool LLParser::parseFunctionBody(Function &Fn, unsigned FunctionNumber, /// parseBasicBlock /// ::= (LabelStr|LabelID)? Instruction* bool LLParser::parseBasicBlock(PerFunctionState &PFS) { + FileLoc BBStart(Lex.getTokLineColumnPos()); + // If this basic block starts out with a name, remember it. std::string Name; int NameID = -1; @@ -7059,6 +7068,7 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) { TrailingDbgRecord.emplace_back(DR, DeleteDbgRecord); } + FileLoc InstStart(Lex.getTokLineColumnPos()); // This instruction may have three possibilities for a name: a) none // specified, b) name specified "%foo =", c) number specified: "%4 =". LocTy NameLoc = Lex.getLoc(); @@ -7108,8 +7118,16 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) { for (DbgRecordPtr &DR : TrailingDbgRecord) BB->insertDbgRecordBefore(DR.release(), Inst->getIterator()); TrailingDbgRecord.clear(); + if (ParserContext) { + ParserContext->addInstructionLocation( + Inst, FileLocRange(InstStart, Lex.getPrevTokEndLineColumnPos())); + } } while (!Inst->isTerminator()); + if (ParserContext) + ParserContext->addBlockLocation( + BB, FileLocRange(BBStart, Lex.getPrevTokEndLineColumnPos())); + assert(TrailingDbgRecord.empty() && "All debug values should have been attached to an instruction."); diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp index 07fdce981b084..c5346d0977314 100644 --- a/llvm/lib/AsmParser/Parser.cpp +++ b/llvm/lib/AsmParser/Parser.cpp @@ -24,33 +24,38 @@ using namespace llvm; static bool parseAssemblyInto(MemoryBufferRef F, Module *M, ModuleSummaryIndex *Index, SMDiagnostic &Err, SlotMapping *Slots, bool UpgradeDebugInfo, - DataLayoutCallbackTy DataLayoutCallback) { + DataLayoutCallbackTy DataLayoutCallback, + AsmParserContext *ParserContext = nullptr) { SourceMgr SM; std::unique_ptr Buf = MemoryBuffer::getMemBuffer(F); SM.AddNewSourceBuffer(std::move(Buf), SMLoc()); std::optional OptContext; return LLParser(F.getBuffer(), SM, Err, M, Index, - M ? M->getContext() : OptContext.emplace(), Slots) + M ? 
M->getContext() : OptContext.emplace(), Slots, + ParserContext) .Run(UpgradeDebugInfo, DataLayoutCallback); } bool llvm::parseAssemblyInto(MemoryBufferRef F, Module *M, ModuleSummaryIndex *Index, SMDiagnostic &Err, SlotMapping *Slots, - DataLayoutCallbackTy DataLayoutCallback) { + DataLayoutCallbackTy DataLayoutCallback, + AsmParserContext *ParserContext) { return ::parseAssemblyInto(F, M, Index, Err, Slots, - /*UpgradeDebugInfo*/ true, DataLayoutCallback); + /*UpgradeDebugInfo*/ true, DataLayoutCallback, + ParserContext); } std::unique_ptr llvm::parseAssembly(MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context, - SlotMapping *Slots, - DataLayoutCallbackTy DataLayoutCallback) { + SlotMapping *Slots, DataLayoutCallbackTy DataLayoutCallback, + AsmParserContext *ParserContext) { std::unique_ptr M = std::make_unique(F.getBufferIdentifier(), Context); - if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, DataLayoutCallback)) + if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, DataLayoutCallback, + ParserContext)) return nullptr; return M; @@ -133,12 +138,14 @@ ParsedModuleAndIndex llvm::parseAssemblyFileWithIndexNoUpgradeDebugInfo( DataLayoutCallback); } -std::unique_ptr llvm::parseAssemblyString(StringRef AsmString, - SMDiagnostic &Err, - LLVMContext &Context, - SlotMapping *Slots) { +std::unique_ptr +llvm::parseAssemblyString(StringRef AsmString, SMDiagnostic &Err, + LLVMContext &Context, SlotMapping *Slots, + AsmParserContext *ParserContext) { MemoryBufferRef F(AsmString, ""); - return parseAssembly(F, Err, Context, Slots); + return parseAssembly( + F, Err, Context, Slots, [](StringRef, StringRef) { return std::nullopt; }, + ParserContext); } static bool parseSummaryIndexAssemblyInto(MemoryBufferRef F, diff --git a/llvm/lib/IRReader/IRReader.cpp b/llvm/lib/IRReader/IRReader.cpp index a7e7deee8aa91..c16871f081d1d 100644 --- a/llvm/lib/IRReader/IRReader.cpp +++ b/llvm/lib/IRReader/IRReader.cpp @@ -8,6 +8,7 @@ #include "llvm/IRReader/IRReader.h" #include "llvm-c/IRReader.h" +#include "llvm/AsmParser/AsmParserContext.h" #include "llvm/AsmParser/Parser.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/LLVMContext.h" @@ -68,7 +69,8 @@ std::unique_ptr llvm::getLazyIRFileModule(StringRef Filename, std::unique_ptr llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err, LLVMContext &Context, - ParserCallbacks Callbacks) { + ParserCallbacks Callbacks, + llvm::AsmParserContext *ParserContext) { NamedRegionTimer T(TimeIRParsingName, TimeIRParsingDescription, TimeIRParsingGroupName, TimeIRParsingGroupDescription, TimePassesIsEnabled); @@ -88,12 +90,14 @@ std::unique_ptr llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err, return parseAssembly(Buffer, Err, Context, nullptr, Callbacks.DataLayout.value_or( - [](StringRef, StringRef) { return std::nullopt; })); + [](StringRef, StringRef) { return std::nullopt; }), + ParserContext); } std::unique_ptr llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context, - ParserCallbacks Callbacks) { + ParserCallbacks Callbacks, + AsmParserContext *ParserContext) { ErrorOr> FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename, /*IsText=*/true); if (std::error_code EC = FileOrErr.getError()) { @@ -102,7 +106,8 @@ std::unique_ptr llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err, return nullptr; } - return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context, Callbacks); + return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context, Callbacks, + ParserContext); } 
//===----------------------------------------------------------------------===// diff --git a/llvm/unittests/AsmParser/AsmParserTest.cpp b/llvm/unittests/AsmParser/AsmParserTest.cpp index ce226705068af..898a8293925b6 100644 --- a/llvm/unittests/AsmParser/AsmParserTest.cpp +++ b/llvm/unittests/AsmParser/AsmParserTest.cpp @@ -6,7 +6,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/AsmParser/AsmParserContext.h" #include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" #include "llvm/IR/Constants.h" @@ -14,10 +16,14 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" +#define DEBUG_TYPE "unittest-asm-parser-tests" + using namespace llvm; namespace { @@ -479,4 +485,53 @@ TEST(AsmParserTest, DIExpressionBodyAtBeginningWithSlotMappingParsing) { ASSERT_EQ(Mapping.MetadataNodes.size(), 0u); } +#define ASSERT_EQ_LOC(Loc1, Loc2) \ + do { \ + EXPECT_TRUE(Loc1.contains(Loc2) && Loc2.contains(Loc1)) \ + << #Loc1 " location: " << Loc1.Start.Line << ":" << Loc1.Start.Col \ + << " - " << Loc1.End.Line << ":" << Loc1.End.Col << "\n" \ + << #Loc2 " location: " << Loc2.Start.Line << ":" << Loc2.Start.Col \ + << " - " << Loc2.End.Line << ":" << Loc2.End.Col << "\n"; \ + } while (false) + +TEST(AsmParserTest, ParserObjectLocations) { + StringRef Source = "define i32 @main() {\n" + "entry:\n" + " %a = add i32 1, 2\n" + " ret i32 %a\n" + "}\n"; + LLVMContext Ctx; + SMDiagnostic Error; + SlotMapping Mapping; + AsmParserContext ParserContext; + auto Mod = parseAssemblyString(Source, Error, Ctx, &Mapping, &ParserContext); + + auto *MainFn = Mod->getFunction("main"); + ASSERT_TRUE(MainFn != nullptr); + + auto MaybeMainLoc = ParserContext.getFunctionLocation(MainFn); + EXPECT_TRUE(MaybeMainLoc.has_value()); + auto MainLoc = MaybeMainLoc.value(); + auto ExpectedMainLoc = FileLocRange(FileLoc{0, 0}, FileLoc{4, 1}); + ASSERT_EQ_LOC(MainLoc, ExpectedMainLoc); + + auto &EntryBB = MainFn->getEntryBlock(); + auto MaybeEntryBBLoc = ParserContext.getBlockLocation(&EntryBB); + ASSERT_TRUE(MaybeEntryBBLoc.has_value()); + auto EntryBBLoc = MaybeEntryBBLoc.value(); + auto ExpectedEntryBBLoc = FileLocRange(FileLoc{1, 0}, FileLoc{3, 14}); + ASSERT_EQ_LOC(EntryBBLoc, ExpectedEntryBBLoc); + + SmallVector InstructionLocations = { + FileLocRange(FileLoc{2, 4}, FileLoc{2, 21}), + FileLocRange(FileLoc{3, 4}, FileLoc{3, 14})}; + + for (const auto &[Inst, ExpectedLoc] : zip(EntryBB, InstructionLocations)) { + auto MaybeInstLoc = ParserContext.getInstructionLocation(&Inst); + ASSERT_TRUE(MaybeMainLoc.has_value()); + auto InstLoc = MaybeInstLoc.value(); + ASSERT_EQ_LOC(InstLoc, ExpectedLoc); + } +} + } // end anonymous namespace From 65850de74d7fb52044e68bd79cc55e7fa1edcf0d Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 22 Oct 2025 09:46:29 -0500 Subject: [PATCH 115/530] [HIP] Hack around CMake incorrectly parsing OpenMP support in HIP mode (#164482) Summary: The new driver uses an embedded clang job to handle the device compilation. This correctly returns the linker as being `ld.lld`. The CMake parser to detect things like OpenMP support in the linker will parse the output of `-v` for the linker job. 
If the user is also using LLD for their linker, it will think this job is the linker and then fail as it does not see the required libraries. For these special HIP compilations, just print the linker wrapper invocation and nothing else. This will still show users the steps used to generate the binary but it will hack around this issue. --- clang/lib/Driver/ToolChains/Clang.cpp | 10 +++++++++- clang/test/Driver/hip-toolchain-no-rdc.hip | 7 +++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a7310ba2da061..caf74786aebea 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9099,6 +9099,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, }; auto ShouldForward = [&](const llvm::DenseSet &Set, Arg *A, const ToolChain &TC) { + // CMake hack to avoid printing verbose informatoin for HIP non-RDC mode. + if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object) + return false; return (Set.contains(A->getOption().getID()) || (A->getOption().getGroup().isValid() && Set.contains(A->getOption().getGroup().getID()))) && @@ -9174,7 +9177,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back( Args.MakeArgString("--host-triple=" + getToolChain().getTripleString())); - if (Args.hasArg(options::OPT_v)) + + // CMake hack, suppress passing verbose arguments for the special-case HIP + // non-RDC mode compilation. This confuses default CMake implicit linker + // argument parsing when the language is set to HIP and the system linker is + // also `ld.lld`. + if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object) CmdArgs.push_back("--wrapper-verbose"); if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ)) CmdArgs.push_back( diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index 2f40fd478671e..840334e19e7f2 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -214,3 +214,10 @@ // AMDGCNSPIRV: {{".*clang-offload-bundler.*"}} "-type=o" // AMDGCNSPIRV-SAME: "-targets={{.*}}hipv4-spirv64-amd-amdhsa--amdgcnspirv,hipv4-amdgcn-amd-amdhsa--gfx900" // AMDGCNSPIRV-SAME: "-input=[[AMDGCNSPV_CO]]" "-input=[[GFX900_CO]]" + +// Check verbose printing with the new driver. +// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \ +// RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE +// VERBOSE: clang-linker-wrapper +// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v +// VERBOSE-NOT: --wrapper-verbose From f6ba21389fb0daad19bbb36a60cbfb4e7ef4efa2 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 22 Oct 2025 23:47:29 +0900 Subject: [PATCH 116/530] [clang] Fix inconsistencies with the device_kernel attr on different targets (#161905) The original [change](https://github.com/llvm/llvm-project/pull/137882) unifying the device kernel attributes had some inexplicable behavior, such as `amdgpu_kernel` resulting in a function ending up with the `spir_kernel` CC but `nvptx_kernel` not doing the same, both cases compiling for SPIR. There was also a [crash](https://github.com/llvm/llvm-project/issues/161077). `sycl_kernel` is now separated out from `device_kernel`, but still there was some weird behavior for the remaining spellings. 
For the target-specific spellings (`nvptx_kernel` and `amdgpu_kernel`), while not technically required, we warn and ignore the attribute if the spelling doesn't match the target because it's weird from the user's point of view to allow it. Also we make sure that any valid usage actually applies the CC to the generated `llvm:Function`. This worked for `NVPTX` already but was missing for `SPIR/SPIR-V` and `AMDGPU`, it needs to be explicitly done in `TargetInfo`. This allows us to remove the `amdgpu_kernel` specific handing we had. That special handling was previously required because it was the only variation that was allowed on a type, and thus had a separate way to propagate the CC. These issues were reported [here](https://github.com/llvm/llvm-project/issues/161077) and [here](https://github.com/llvm/llvm-project/pull/161349). Closes: https://github.com/llvm/llvm-project/issues/161077 --------- Signed-off-by: Sarnie, Nick --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/Attr.td | 2 +- .../clang/Basic/DiagnosticSemaKinds.td | 3 ++ clang/lib/AST/TypePrinter.cpp | 3 -- clang/lib/Basic/Targets/NVPTX.h | 2 +- clang/lib/CodeGen/Targets/AMDGPU.cpp | 8 +++-- clang/lib/CodeGen/Targets/NVPTX.cpp | 4 +-- clang/lib/CodeGen/Targets/SPIR.cpp | 36 +++++++++++++++---- clang/lib/Sema/SemaDeclAttr.cpp | 24 +++++++++++-- clang/lib/Sema/SemaType.cpp | 18 ++-------- clang/test/Sema/callingconv-devicekernel.cpp | 16 +++++++++ clang/test/Sema/callingconv.c | 4 +++ 12 files changed, 86 insertions(+), 35 deletions(-) create mode 100644 clang/test/Sema/callingconv-devicekernel.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index fe77f917bb801..e6e33e7a9a280 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -447,6 +447,7 @@ Bug Fixes to Attribute Support - Using ``[[gnu::cleanup(some_func)]]`` where some_func is annotated with ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520) - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) +- Fixes crashes or missing diagnostics with the `device_kernel` attribute. 
(#GH161905) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index eb48a0c01fd1e..b320f4bedba0c 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1623,7 +1623,7 @@ def SYCLKernel : InheritableAttr { let Documentation = [SYCLKernelDocs]; } -def DeviceKernel : DeclOrTypeAttr { +def DeviceKernel : InheritableAttr { let Spellings = [Clang<"device_kernel">, Clang<"nvptx_kernel">, Clang<"amdgpu_kernel">, CustomKeyword<"__kernel">, CustomKeyword<"kernel">]; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5ff4cc4b833d9..20b499462ae94 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -4126,6 +4126,9 @@ def warn_missing_sdksettings_for_availability_checking : Warning< "%0 availability is ignored without a valid 'SDKSettings.json' in the SDK">, InGroup>; +def err_hidden_device_kernel + : Error<"%0 is specified as a device kernel but it is not externally visible">; + // Thread Safety Attributes def warn_thread_attribute_ignored : Warning< "ignoring %0 attribute because its argument is invalid">, diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 2da7789fe8117..c18b2eafc722c 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2147,9 +2147,6 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, } case attr::AArch64VectorPcs: OS << "aarch64_vector_pcs"; break; case attr::AArch64SVEPcs: OS << "aarch64_sve_pcs"; break; - case attr::DeviceKernel: - OS << T->getAttr()->getSpelling(); - break; case attr::IntelOclBicc: OS << "inteloclbicc"; break; diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index 33c29586359be..f5c8396f398aa 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -200,7 +200,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { // a host function. if (HostTarget) return HostTarget->checkCallingConvention(CC); - return CCCR_Warning; + return CC == CC_DeviceKernel ? CCCR_OK : CCCR_Warning; } bool hasBitIntType() const override { return true; } diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 0bc4b4b7025f2..e4ad078dab197 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -439,9 +439,11 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( return; const FunctionDecl *FD = dyn_cast_or_null(D); - if (FD) + if (FD) { setFunctionDeclAttributes(FD, F, M); - + if (FD->hasAttr() && !M.getLangOpts().OpenCL) + F->setCallingConv(getDeviceKernelCallingConv()); + } if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) F->addFnAttr("amdgpu-ieee", "false"); } @@ -658,7 +660,7 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel( // kernel address (only the kernel descriptor). 
auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name, &Mod); - F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + F->setCallingConv(getDeviceKernelCallingConv()); llvm::AttrBuilder KernelAttrs(C); // FIXME: The invoke isn't applying the right attributes either diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 53f2fc4a040a7..f6715861d91bc 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -264,7 +264,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); if (FD->hasAttr()) { - F->setCallingConv(llvm::CallingConv::PTX_Kernel); + F->setCallingConv(getDeviceKernelCallingConv()); for (auto IV : llvm::enumerate(FD->parameters())) if (IV.value()->hasAttr()) @@ -278,7 +278,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( } // Attach kernel metadata directly if compiling for NVPTX. if (FD->hasAttr()) - F->setCallingConv(llvm::CallingConv::PTX_Kernel); + F->setCallingConv(getDeviceKernelCallingConv()); } void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV, diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 80e096ecf5ae9..15d0b353d748c 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -64,6 +64,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo { llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const override; + void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, + CodeGen::CodeGenModule &M) const override; }; class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { public: @@ -268,6 +270,22 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::ConstantPointerNull::get(NPT), PT); } +void CommonSPIRTargetCodeGenInfo::setTargetAttributes( + const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { + if (M.getLangOpts().OpenCL || GV->isDeclaration()) + return; + + const FunctionDecl *FD = dyn_cast(D); + if (!FD) + return; + + llvm::Function *F = dyn_cast(GV); + assert(F && "Expected GlobalValue to be a Function"); + + if (FD->hasAttr()) + F->setCallingConv(getDeviceKernelCallingConv()); +} + LangAS SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const { @@ -292,19 +310,23 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, void SPIRVTargetCodeGenInfo::setTargetAttributes( const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { - if (!M.getLangOpts().HIP || - M.getTarget().getTriple().getVendor() != llvm::Triple::AMD) - return; if (GV->isDeclaration()) return; - auto F = dyn_cast(GV); - if (!F) + const FunctionDecl *FD = dyn_cast_or_null(D); + if (!FD) return; - auto FD = dyn_cast_or_null(D); - if (!FD) + llvm::Function *F = dyn_cast(GV); + assert(F && "Expected GlobalValue to be a Function"); + + if (FD->hasAttr()) + F->setCallingConv(getDeviceKernelCallingConv()); + + if (!M.getLangOpts().HIP || + M.getTarget().getTriple().getVendor() != llvm::Triple::AMD) return; + if (!FD->hasAttr()) return; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 9475b8a684082..964a2a791e18f 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5206,16 +5206,36 @@ static void handleCallConvAttr(Sema &S, Decl *D, const ParsedAttr &AL) { 
static void handleDeviceKernelAttr(Sema &S, Decl *D, const ParsedAttr &AL) { const auto *FD = dyn_cast_or_null(D); bool IsFunctionTemplate = FD && FD->getDescribedFunctionTemplate(); - if (S.getASTContext().getTargetInfo().getTriple().isNVPTX()) { + llvm::Triple Triple = S.getASTContext().getTargetInfo().getTriple(); + const LangOptions &LangOpts = S.getLangOpts(); + // OpenCL has its own error messages. + if (!LangOpts.OpenCL && FD && !FD->isExternallyVisible()) { + S.Diag(AL.getLoc(), diag::err_hidden_device_kernel) << FD; + AL.setInvalid(); + return; + } + if (Triple.isNVPTX()) { handleGlobalAttr(S, D, AL); } else { // OpenCL C++ will throw a more specific error. - if (!S.getLangOpts().OpenCLCPlusPlus && (!FD || IsFunctionTemplate)) { + if (!LangOpts.OpenCLCPlusPlus && (!FD || IsFunctionTemplate)) { S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type_str) << AL << AL.isRegularKeywordAttribute() << "functions"; + AL.setInvalid(); + return; } handleSimpleAttribute(S, D, AL); } + // TODO: isGPU() should probably return true for SPIR. + bool TargetDeviceEnvironment = Triple.isGPU() || Triple.isSPIR() || + LangOpts.isTargetDevice() || LangOpts.OpenCL; + if (!TargetDeviceEnvironment) { + S.Diag(AL.getLoc(), diag::warn_cconv_unsupported) + << AL << (int)Sema::CallingConventionIgnoredReason::ForThisTarget; + AL.setInvalid(); + return; + } + // Make sure we validate the CC with the target // and warn/error if necessary. handleCallConvAttr(S, D, AL); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 7c1fb12a549e6..280b3c92cce14 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -134,7 +134,6 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_VectorCall: \ case ParsedAttr::AT_AArch64VectorPcs: \ case ParsedAttr::AT_AArch64SVEPcs: \ - case ParsedAttr::AT_DeviceKernel: \ case ParsedAttr::AT_MSABI: \ case ParsedAttr::AT_SysVABI: \ case ParsedAttr::AT_Pcs: \ @@ -3786,7 +3785,8 @@ static CallingConv getCCForDeclaratorChunk( } } } - for (const ParsedAttr &AL : D.getDeclSpec().getAttributes()) { + for (const ParsedAttr &AL : llvm::concat( + D.getDeclSpec().getAttributes(), D.getAttributes())) { if (AL.getKind() == ParsedAttr::AT_DeviceKernel) { CC = CC_DeviceKernel; break; @@ -7569,8 +7569,6 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) { return createSimpleAttr(Ctx, Attr); case ParsedAttr::AT_ArmStreaming: return createSimpleAttr(Ctx, Attr); - case ParsedAttr::AT_DeviceKernel: - return createSimpleAttr(Ctx, Attr); case ParsedAttr::AT_Pcs: { // The attribute may have had a fixit applied where we treated an // identifier as a string literal. The contents of the string are valid, @@ -8809,16 +8807,6 @@ static void HandleHLSLParamModifierAttr(TypeProcessingState &State, } } -static bool isMultiSubjectAttrAllowedOnType(const ParsedAttr &Attr) { - // The DeviceKernel attribute is shared for many targets, and - // it is only allowed to be a type attribute with the AMDGPU - // spelling, so skip processing the attr as a type attr - // unless it has that spelling. 
- if (Attr.getKind() != ParsedAttr::AT_DeviceKernel) - return true; - return DeviceKernelAttr::isAMDGPUSpelling(Attr); -} - static void processTypeAttrs(TypeProcessingState &state, QualType &type, TypeAttrLocation TAL, const ParsedAttributesView &attrs, @@ -9072,8 +9060,6 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, break; [[fallthrough]]; FUNCTION_TYPE_ATTRS_CASELIST: - if (!isMultiSubjectAttrAllowedOnType(attr)) - break; attr.setUsedAsTypeAttr(); diff --git a/clang/test/Sema/callingconv-devicekernel.cpp b/clang/test/Sema/callingconv-devicekernel.cpp new file mode 100644 index 0000000000000..f5da873fb68bd --- /dev/null +++ b/clang/test/Sema/callingconv-devicekernel.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda- -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple spir64 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple spirv64 -fsyntax-only -verify %s + +[[clang::device_kernel]] void kernel1() {} + +namespace { +[[clang::device_kernel]] void kernel2() {} // expected-error {{'kernel2' is specified as a device kernel but it is not externally visible}} +} + +namespace ns { + [[clang::device_kernel]] void kernel3() {} +} + +[[clang::device_kernel]] static void kernel4() {} // expected-error {{'kernel4' is specified as a device kernel but it is not externally visible}} diff --git a/clang/test/Sema/callingconv.c b/clang/test/Sema/callingconv.c index f0b8b80a32974..28342b56af39a 100644 --- a/clang/test/Sema/callingconv.c +++ b/clang/test/Sema/callingconv.c @@ -55,6 +55,10 @@ int __attribute__((aarch64_vector_pcs)) aavpcs(void); // expected-warning {{'aar int __attribute__((aarch64_sve_pcs)) aasvepcs(void); // expected-warning {{'aarch64_sve_pcs' calling convention is not supported for this target}} int __attribute__((amdgpu_kernel)) amdgpu_kernel(void); // expected-warning {{'amdgpu_kernel' calling convention is not supported for this target}} +int __attribute__((device_kernel)) device_kernel(void) { // expected-warning {{'device_kernel' calling convention is not supported for this target}} +} +int __attribute__((sycl_kernel)) sycl_kernel(void) { // expected-warning {{'sycl_kernel' attribute ignored}} +} // PR6361 void ctest3(); From ae11c5c2c4d7ae4cba4a8e05f0c7d85b316a2cf0 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Oct 2025 10:51:03 -0400 Subject: [PATCH 117/530] [mlir] Switch uses of deprecated .create methods to free function. NFC. (#164635) See https://discourse.llvm.org/t/psa-opty-create-now-with-100-more-tab-complete/87339. 
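For reference, the mechanical pattern applied throughout the hunks below is
roughly the following (illustrative sketch only, using an op that appears in
this patch):

  // Before: deprecated OpBuilder member template.
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);

  // After: free function on the op class, as migrated in this patch.
  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);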
--- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 10 +++--- .../ComplexToROCDLLibraryCalls.cpp | 10 +++--- .../ComplexToStandard/ComplexToStandard.cpp | 6 ++-- .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 21 +++++++------ mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp | 4 +-- mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp | 26 ++++++++-------- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 24 +++++++------- .../TransformOps/LinalgTransformOps.cpp | 14 +++++---- .../Transforms/RuntimeOpVerification.cpp | 13 ++++---- mlir/lib/Dialect/SCF/IR/SCF.cpp | 8 ++--- .../Vector/Transforms/LowerVectorShuffle.cpp | 6 ++-- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 +- .../Transforms/XeGPUSubgroupDistribute.cpp | 12 +++---- .../Transforms/XeGPUWgToSgDistribute.cpp | 31 ++++++++++--------- .../SPIRV/Deserialization/Deserializer.cpp | 8 ++--- mlir/lib/Target/Wasm/TranslateFromWasm.cpp | 20 ++++++------ mlir/test/lib/Dialect/Test/TestPatterns.cpp | 2 +- 17 files changed, 110 insertions(+), 107 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 85f0fd1dd1048..9b154350cd913 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1927,16 +1927,16 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern { else llvm_unreachable("unsupported row length"); - const Value vdst0 = LLVM::ExtractValueOp::create(rewriter, loc, res, {0}); - const Value vdst1 = LLVM::ExtractValueOp::create(rewriter, loc, res, {1}); + Value vdst0 = LLVM::ExtractValueOp::create(rewriter, loc, res, {0}); + Value vdst1 = LLVM::ExtractValueOp::create(rewriter, loc, res, {1}); - const Value isEqual = - rewriter.create(loc, LLVM::ICmpPredicate::eq, vdst0, v); + Value isEqual = LLVM::ICmpOp::create(rewriter, loc, + LLVM::ICmpPredicate::eq, vdst0, v); // Per `permlane(16|32)` semantics: if the first extracted element equals // 'v', the result is the second element; otherwise it is the first. 
Value vdstNew = - rewriter.create(loc, isEqual, vdst1, vdst0); + LLVM::SelectOp::create(rewriter, loc, isEqual, vdst1, vdst0); permuted.emplace_back(vdstNew); } diff --git a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp index 42099aaa6b574..12adfe13a5685 100644 --- a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp +++ b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp @@ -93,11 +93,11 @@ struct PowiOpToROCDLLibraryCalls : public OpRewritePattern { Location loc = op.getLoc(); Value exponentReal = - rewriter.create(loc, exponentFloatType, op.getRhs()); - Value zeroImag = rewriter.create( - loc, rewriter.getZeroAttr(exponentFloatType)); - Value exponent = rewriter.create( - loc, op.getLhs().getType(), exponentReal, zeroImag); + arith::SIToFPOp::create(rewriter, loc, exponentFloatType, op.getRhs()); + Value zeroImag = arith::ConstantOp::create( + rewriter, loc, rewriter.getZeroAttr(exponentFloatType)); + Value exponent = complex::CreateOp::create( + rewriter, loc, op.getLhs().getType(), exponentReal, zeroImag); rewriter.replaceOpWithNewOp(op, op.getType(), op.getLhs(), exponent, op.getFastmathAttr()); diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 5613e021cd709..0fe72394b61d6 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -937,14 +937,14 @@ struct PowiOpConversion : public OpConversionPattern { auto elementType = cast(type.getElementType()); Value floatExponent = - builder.create(elementType, adaptor.getRhs()); + arith::SIToFPOp::create(builder, elementType, adaptor.getRhs()); Value zero = arith::ConstantOp::create( builder, elementType, builder.getFloatAttr(elementType, 0.0)); Value complexExponent = complex::CreateOp::create(builder, type, floatExponent, zero); - auto pow = builder.create( - type, adaptor.getLhs(), complexExponent, op.getFastmathAttr()); + auto pow = complex::PowOp::create(builder, type, adaptor.getLhs(), + complexExponent, op.getFastmathAttr()); rewriter.replaceOp(op, pow.getResult()); return success(); } diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 852c50c965f11..d64c4d64cad84 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -500,19 +500,19 @@ struct SincosOpLowering : public ConvertOpToLLVMPattern { op->getParentWithTrait(); assert(scope && "Expected op to be inside automatic allocation scope"); rewriter.setInsertionPointToStart(&scope->getRegion(0).front()); - auto one = rewriter.create( - loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1)); + auto one = LLVM::ConstantOp::create(rewriter, loc, rewriter.getI32Type(), + rewriter.getI32IntegerAttr(1)); sinPtr = - rewriter.create(loc, ptrType, computeType, one, 0); + LLVM::AllocaOp::create(rewriter, loc, ptrType, computeType, one, 0); cosPtr = - rewriter.create(loc, ptrType, computeType, one, 0); + LLVM::AllocaOp::create(rewriter, loc, ptrType, computeType, one, 0); } createSincosCall(rewriter, loc, sincosFunc, convertedInput, sinPtr, cosPtr, op); - auto sinResult = rewriter.create(loc, computeType, sinPtr); - auto cosResult = rewriter.create(loc, computeType, cosPtr); + auto sinResult = 
LLVM::LoadOp::create(rewriter, loc, computeType, sinPtr); + auto cosResult = LLVM::LoadOp::create(rewriter, loc, computeType, cosPtr); rewriter.replaceOp(op, {maybeTrunc(sinResult, inputType, rewriter), maybeTrunc(cosResult, inputType, rewriter)}); @@ -522,14 +522,15 @@ struct SincosOpLowering : public ConvertOpToLLVMPattern { private: Value maybeExt(Value operand, PatternRewriter &rewriter) const { if (isa(operand.getType())) - return rewriter.create( - operand.getLoc(), Float32Type::get(rewriter.getContext()), operand); + return LLVM::FPExtOp::create(rewriter, operand.getLoc(), + Float32Type::get(rewriter.getContext()), + operand); return operand; } Value maybeTrunc(Value operand, Type type, PatternRewriter &rewriter) const { if (operand.getType() != type) - return rewriter.create(operand.getLoc(), type, operand); + return LLVM::FPTruncOp::create(rewriter, operand.getLoc(), type, operand); return operand; } @@ -556,7 +557,7 @@ struct SincosOpLowering : public ConvertOpToLLVMPattern { } SmallVector callOperands = {input, sinPtr, cosPtr}; - rewriter.create(loc, funcOp, callOperands); + LLVM::CallOp::create(rewriter, loc, funcOp, callOperands); } }; diff --git a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp index 229e40e2061cb..7cce324f94295 100644 --- a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp +++ b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp @@ -142,8 +142,8 @@ struct SincosOpLowering : public ConvertOpToLLVMPattern { auto structType = LLVM::LLVMStructType::getLiteral( rewriter.getContext(), {llvmOperandType, llvmOperandType}); - auto sincosOp = rewriter.create( - loc, structType, adaptor.getOperand(), attrs.getAttrs()); + auto sincosOp = LLVM::SincosOp::create( + rewriter, loc, structType, adaptor.getOperand(), attrs.getAttrs()); auto sinValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 0); auto cosValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 1); diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp index 519d9c8f835f2..71e3f88a63f34 100644 --- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp +++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp @@ -394,9 +394,9 @@ struct WhileLowering : public OpConversionPattern { if (!convertedType) return rewriter.notifyMatchFailure(whileOp, "type conversion failed"); - emitc::VariableOp var = rewriter.create( - loc, emitc::LValueType::get(convertedType), noInit); - rewriter.create(loc, var.getResult(), init); + auto var = emitc::VariableOp::create( + rewriter, loc, emitc::LValueType::get(convertedType), noInit); + emitc::AssignOp::create(rewriter, loc, var.getResult(), init); loopVars.push_back(var); } @@ -411,11 +411,11 @@ struct WhileLowering : public OpConversionPattern { // Create a global boolean variable to store the loop condition state. Type i1Type = IntegerType::get(context, 1); auto globalCondition = - rewriter.create(loc, emitc::LValueType::get(i1Type), - emitc::OpaqueAttr::get(context, "")); + emitc::VariableOp::create(rewriter, loc, emitc::LValueType::get(i1Type), + emitc::OpaqueAttr::get(context, "")); Value conditionVal = globalCondition.getResult(); - auto loweredDo = rewriter.create(loc); + auto loweredDo = emitc::DoOp::create(rewriter, loc); // Convert region types to match the target dialect type system. if (failed(rewriter.convertRegionTypes(&whileOp.getBefore(), @@ -450,12 +450,12 @@ struct WhileLowering : public OpConversionPattern { // Convert scf.condition to condition variable assignment. 
Value condition = rewriter.getRemappedValue(condOp.getCondition()); - rewriter.create(loc, conditionVal, condition); + emitc::AssignOp::create(rewriter, loc, conditionVal, condition); // Wrap body region in conditional to preserve scf semantics. Only create // ifOp if after-region is non-empty. if (whileOp.getAfterBody()->getOperations().size() > 1) { - auto ifOp = rewriter.create(loc, condition, false, false); + auto ifOp = emitc::IfOp::create(rewriter, loc, condition, false, false); // Prepare the after region (loop body) for merging. Block *afterBlock = &whileOp.getAfter().front(); @@ -480,8 +480,8 @@ struct WhileLowering : public OpConversionPattern { Block *condBlock = rewriter.createBlock(&condRegion); rewriter.setInsertionPointToStart(condBlock); - auto exprOp = rewriter.create( - loc, i1Type, conditionVal, /*do_not_inline=*/false); + auto exprOp = emitc::ExpressionOp::create( + rewriter, loc, i1Type, conditionVal, /*do_not_inline=*/false); Block *exprBlock = rewriter.createBlock(&exprOp.getBodyRegion()); // Set up the expression block to load the condition variable. @@ -490,12 +490,12 @@ struct WhileLowering : public OpConversionPattern { // Load the condition value and yield it as the expression result. Value cond = - rewriter.create(loc, i1Type, exprBlock->getArgument(0)); - rewriter.create(loc, cond); + emitc::LoadOp::create(rewriter, loc, i1Type, exprBlock->getArgument(0)); + emitc::YieldOp::create(rewriter, loc, cond); // Yield the expression as the condition region result. rewriter.setInsertionPointToEnd(condBlock); - rewriter.create(loc, exprOp); + emitc::YieldOp::create(rewriter, loc, exprOp); return success(); } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 00df14b1bdb77..29afdc240c9f8 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -232,16 +232,16 @@ static Value createLinalgBodyCalculationForElementwiseOp( } intermediateType = rewriter.getIntegerType(intermediateBitWidth); - zpAddValue = rewriter.create( - loc, rewriter.getIntegerAttr(intermediateType, zpAdd)); + zpAddValue = arith::ConstantOp::create( + rewriter, loc, rewriter.getIntegerAttr(intermediateType, zpAdd)); } else { intermediateType = rewriter.getIntegerType(intermediateBitWidth); auto arg1 = - rewriter.create(loc, intermediateType, args[1]); + arith::ExtSIOp::create(rewriter, loc, intermediateType, args[1]); auto arg2 = - rewriter.create(loc, intermediateType, args[2]); + arith::ExtSIOp::create(rewriter, loc, intermediateType, args[2]); zpAddValue = - rewriter.create(loc, intermediateType, arg1, arg2); + arith::AddIOp::create(rewriter, loc, intermediateType, arg1, arg2); } // The negation can be applied by doing: @@ -1402,8 +1402,8 @@ static Value collapse1xNTensorToN(PatternRewriter &rewriter, Value input, auto elemType = inputType.getElementType(); auto collapsedType = RankedTensorType::get({}, elemType); // Emit the collapse op - return rewriter.create(loc, collapsedType, input, - reassociation); + return tensor::CollapseShapeOp::create(rewriter, loc, collapsedType, input, + reassociation); } static llvm::SmallVector @@ -1443,7 +1443,7 @@ static void setupLinalgGenericOpInputAndIndexingMap( IntegerAttr intAttr = isShift ? rewriter.getI8IntegerAttr(values.front()) : rewriter.getI32IntegerAttr(values.front()); - constant = rewriter.create(loc, intAttr); + constant = arith::ConstantOp::create(rewriter, loc, intAttr); } else { auto elementType = isShift ? 
rewriter.getIntegerType(8) : rewriter.getI32Type(); @@ -1511,14 +1511,14 @@ static Value getExtendZp(OpBuilder &builder, Type valueTy, .getResult(0); } if (zpTy.isUnsignedInteger()) { - return builder.create(loc, extendType, result); + return arith::ExtUIOp::create(builder, loc, extendType, result); } else { - return builder.create(loc, extendType, result); + return arith::ExtSIOp::create(builder, loc, extendType, result); } } } else { - return builder.create( - loc, IntegerAttr::get(extendType, *maybeZp)); + return arith::ConstantOp::create(builder, loc, + IntegerAttr::get(extendType, *maybeZp)); } return result; } diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 9a8a63e54d02d..794dda96d1dfa 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -437,13 +437,15 @@ transform::PromoteTensorOp::apply(transform::TransformRewriter &rewriter, for (auto [pos, dim] : llvm::enumerate(type.getShape())) { if (!ShapedType::isDynamic(dim)) continue; - Value cst = rewriter.create(tensor.getLoc(), pos); - auto dimOp = rewriter.create(tensor.getLoc(), tensor, cst); + Value cst = + arith::ConstantIndexOp::create(rewriter, tensor.getLoc(), pos); + auto dimOp = + tensor::DimOp::create(rewriter, tensor.getLoc(), tensor, cst); preservedOps.insert(dimOp); dynamicDims.push_back(dimOp); } - auto allocation = rewriter.create( - tensor.getLoc(), type, dynamicDims); + auto allocation = bufferization::AllocTensorOp::create( + rewriter, tensor.getLoc(), type, dynamicDims); // Set memory space if provided. if (getMemorySpaceAttr()) allocation.setMemorySpaceAttr(getMemorySpaceAttr()); @@ -452,8 +454,8 @@ transform::PromoteTensorOp::apply(transform::TransformRewriter &rewriter, // Only insert a materialization (typically bufferizes to a copy) when the // value may be read from. 
if (needsMaterialization) { - auto copy = rewriter.create( - tensor.getLoc(), tensor, allocated); + auto copy = bufferization::MaterializeInDestinationOp::create( + rewriter, tensor.getLoc(), tensor, allocated); preservedOps.insert(copy); promoted.push_back(copy.getResult()); } else { diff --git a/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp index 181b4846835c0..5e10ba3780607 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp @@ -50,24 +50,25 @@ struct StructuredOpInterface auto endValue = getValueOrCreateConstantIndexOp(builder, loc, end); // Loop Trip count > 0 iff start < end - Value dimensionHasNonZeroTripCount = builder.create( - loc, index::IndexCmpPredicate::SLT, startValue, endValue); + Value dimensionHasNonZeroTripCount = index::CmpOp::create( + builder, loc, index::IndexCmpPredicate::SLT, startValue, endValue); if (!iterationDomainIsNonDegenerate) { iterationDomainIsNonDegenerate = dimensionHasNonZeroTripCount; } else { // Iteration domain is non-degenerate iff all dimensions have loop trip // count > 0 - iterationDomainIsNonDegenerate = builder.create( - loc, iterationDomainIsNonDegenerate, dimensionHasNonZeroTripCount); + iterationDomainIsNonDegenerate = + arith::AndIOp::create(builder, loc, iterationDomainIsNonDegenerate, + dimensionHasNonZeroTripCount); } } if (!iterationDomainIsNonDegenerate) return; - auto ifOp = builder.create(loc, iterationDomainIsNonDegenerate, - /*withElseRegion=*/false); + auto ifOp = scf::IfOp::create(builder, loc, iterationDomainIsNonDegenerate, + /*withElseRegion=*/false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); // Subtract one from the loop ends before composing with the indexing map diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index a9da6c2c8320a..9bd13f3236cfc 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -2490,8 +2490,8 @@ struct ConditionPropagation : public OpRewritePattern { changed = true; if (!constantTrue) - constantTrue = rewriter.create( - op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 1)); + constantTrue = arith::ConstantOp::create( + rewriter, op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 1)); rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantTrue); }); @@ -2500,8 +2500,8 @@ struct ConditionPropagation : public OpRewritePattern { changed = true; if (!constantFalse) - constantFalse = rewriter.create( - op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 0)); + constantFalse = arith::ConstantOp::create( + rewriter, op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 0)); rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantFalse); }); diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp index 8f46ad6ea892b..ef49c863683ad 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp @@ -74,9 +74,9 @@ struct MixedSizeInputShuffleOpRewrite final for (int64_t i = 0; i < origNumElems; ++i) promoteMask[i] = i; - Value promotedInput = rewriter.create( - shuffleOp.getLoc(), promotedType, inputToPromote, inputToPromote, - promoteMask); + Value promotedInput = + vector::ShuffleOp::create(rewriter, shuffleOp.getLoc(), promotedType, + inputToPromote, inputToPromote, promoteMask); // Create the final shuffle with 
the promoted inputs. Value promotedV1 = promoteV1 ? promotedInput : shuffleOp.getV1(); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 1599ae9f28e90..24e909548fe0b 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -736,7 +736,7 @@ OpFoldResult genBinOp(OpFoldResult a, OpFoldResult b, Location loc, OpBuilder &builder) { auto aVal = getValueOrCreateConstantIndexOp(builder, loc, a); auto bVal = getValueOrCreateConstantIndexOp(builder, loc, b); - return builder.create(loc, aVal, bVal).getResult(); + return ArithOp::create(builder, loc, aVal, bVal).getResult(); } // a helper utility to perform division operation on OpFoldResult and int64_t. diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 26770b3c003ea..ba89daafedb30 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1525,15 +1525,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() { auto warpReduction = [](Location loc, OpBuilder &builder, Value input, vector::CombiningKind kind, uint32_t size) { // First reduce on a single thread to get per lane reduction value. - Value laneVal = builder.create(loc, kind, input); + Value laneVal = vector::ReductionOp::create(builder, loc, kind, input); // Parallel reduction using butterfly shuffles. for (uint64_t i = 1; i < size; i <<= 1) { - Value shuffled = - builder - .create(loc, laneVal, i, - /*width=*/size, - /*mode=*/gpu::ShuffleMode::XOR) - .getShuffleResult(); + Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i, + /*width=*/size, + /*mode=*/gpu::ShuffleMode::XOR) + .getShuffleResult(); laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled); } return laneVal; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 31a967dcd04c7..9fc5ad9af5c7b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -825,7 +825,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto tileAttr = DenseElementsAttr::get(VectorType::get(sgShape, eltType), baseTileValues); - auto baseConstVec = rewriter.create(loc, tileAttr); + auto baseConstVec = arith::ConstantOp::create(rewriter, loc, tileAttr); // Get subgroup id Value sgId = @@ -837,25 +837,26 @@ struct WgToSgArithConstantOp : public OpConversionPattern { SmallVector strideConsts; strideConsts.push_back( - rewriter.create(loc, colStride)); + arith::ConstantIndexOp::create(rewriter, loc, colStride)); if (rows > 1) strideConsts.insert( strideConsts.begin(), - rewriter.create(loc, rowStride)); + arith::ConstantIndexOp::create(rewriter, loc, rowStride)); SmallVector newConstOps; for (auto offsets : *sgOffsets) { // Multiply offset with stride, broadcast it and add to baseConstVec - Value mulOffset = rewriter.create(loc, 0); + Value mulOffset = arith::ConstantIndexOp::create(rewriter, loc, 0); for (size_t i = 0; i < strideConsts.size(); ++i) { - Value mul = rewriter.create( - loc, rewriter.getIndexType(), offsets[i], strideConsts[i]); - mulOffset = rewriter.create( - loc, rewriter.getIndexType(), mulOffset, mul); + Value mul = + arith::MulIOp::create(rewriter, loc, rewriter.getIndexType(), + offsets[i], strideConsts[i]); + mulOffset = arith::AddIOp::create( + 
rewriter, loc, rewriter.getIndexType(), mulOffset, mul); } // Broadcast to baseConstVec size - auto bcastOffset = rewriter.create( - loc, baseConstVec.getType(), mulOffset); + auto bcastOffset = vector::BroadcastOp::create( + rewriter, loc, baseConstVec.getType(), mulOffset); auto finalConst = arith::AddIOp::create(rewriter, loc, baseConstVec, bcastOffset); setLayoutIfNeeded(baseConstVec); @@ -1138,8 +1139,8 @@ struct WgToSgVectorShapeCastOp SmallVector newShapeCastOps; for (auto src : adaptor.getSource()) { - auto newShapeCast = - rewriter.create(op.getLoc(), newResultType, src); + auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), + newResultType, src); if (!layout.getEffectiveLaneLayoutAsInt().empty() || !layout.getEffectiveInstDataAsInt().empty()) xegpu::setDistributeLayoutAttr(newShapeCast->getResult(0), @@ -1201,9 +1202,9 @@ struct WgToSgMultiDimReductionOp SmallVector newReductions; for (auto sgSrc : adaptor.getSource()) { - auto newOp = rewriter.create( - op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], - op.getReductionDims()); + auto newOp = vector::MultiDimReductionOp::create( + rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, + adaptor.getAcc()[0], op.getReductionDims()); if (!layout.getEffectiveLaneLayoutAsInt().empty() || !layout.getEffectiveInstDataAsInt().empty()) xegpu::setDistributeLayoutAttr(newOp->getResult(0), diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index d9ad8fb144b61..6492708694cc5 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -702,8 +702,8 @@ spirv::Deserializer::processGraphEntryPointARM(ArrayRef operands) { // RAII guard to reset the insertion point to previous value when done. OpBuilder::InsertionGuard insertionGuard(opBuilder); opBuilder.setInsertionPoint(graphARM); - opBuilder.create( - unknownLoc, SymbolRefAttr::get(opBuilder.getContext(), name), + spirv::GraphEntryPointARMOp::create( + opBuilder, unknownLoc, SymbolRefAttr::get(opBuilder.getContext(), name), opBuilder.getArrayAttr(interface)); return success(); @@ -736,7 +736,7 @@ spirv::Deserializer::processGraphARM(ArrayRef operands) { std::string graphName = getGraphSymbol(graphID); auto graphOp = - opBuilder.create(unknownLoc, graphName, graphType); + spirv::GraphARMOp::create(opBuilder, unknownLoc, graphName, graphType); curGraph = graphMap[graphID] = graphOp; Block *entryBlock = graphOp.addEntryBlock(); LLVM_DEBUG({ @@ -844,7 +844,7 @@ spirv::Deserializer::processOpGraphSetOutputARM(ArrayRef operands) { LogicalResult spirv::Deserializer::processGraphEndARM(ArrayRef operands) { // Create GraphOutputsARM instruction. - opBuilder.create(unknownLoc, graphOutputs); + spirv::GraphOutputsARMOp::create(opBuilder, unknownLoc, graphOutputs); // Process OpGraphEndARM. 
if (!operands.empty()) { diff --git a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp index 366ba8f0115f4..048e964037558 100644 --- a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp +++ b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp @@ -406,7 +406,7 @@ class ExpressionParser { auto returnOperands = popOperands(resTypes); if (failed(returnOperands)) return failure(); - builder.create(opLoc, *returnOperands); + BlockReturnOp::create(builder, opLoc, *returnOperands); LDBG() << "end of parsing of a block"; return bodyParsingRes->endingByte; } @@ -1000,7 +1000,7 @@ parsed_inst_t ExpressionParser::parseBlockLikeOp(OpBuilder &builder) { builder.createBlock(curRegion, curRegion->end(), resTypes, locations); builder.setInsertionPointToEnd(curBlock); auto blockOp = - builder.create(*currentOpLoc, *inputOps, successor); + OpToCreate::create(builder, *currentOpLoc, *inputOps, successor); auto *blockBody = blockOp.createBlock(); if (failed(parseBlockContent(builder, blockBody, resTypes, *opLoc, blockOp))) return failure(); @@ -1047,8 +1047,8 @@ inline parsed_inst_t ExpressionParser::parseSpecificInstruction< auto *successor = builder.createBlock(curRegion, curRegion->end(), resTypes, locations); builder.setInsertionPointToEnd(curBlock); - auto ifOp = builder.create(*currentOpLoc, conditionValue->front(), - *inputOps, successor); + auto ifOp = IfOp::create(builder, *currentOpLoc, conditionValue->front(), + *inputOps, successor); auto *ifEntryBlock = ifOp.createIfBlock(); constexpr auto ifElseFilter = ByteSequence(*currentOpLoc, condition->front(), - builder.getUI32IntegerAttr(*level), *branchArgs, - elseBlock); + BranchIfOp::create(builder, *currentOpLoc, condition->front(), + builder.getUI32IntegerAttr(*level), *branchArgs, + elseBlock); builder.setInsertionPointToStart(elseBlock); return {*branchArgs}; } @@ -1115,7 +1115,7 @@ ExpressionParser::parseSpecificInstruction( if (failed(inOperands)) return failure(); auto callOp = - builder.create(loc, resTypes, callee.symbol, *inOperands); + FuncCallOp::create(builder, loc, resTypes, callee.symbol, *inOperands); return {callOp.getResults()}; } @@ -1391,8 +1391,8 @@ inline parsed_inst_t ExpressionParser::buildConvertOp(OpBuilder &builder, auto operand = popOperands(intype); if (failed(operand)) return failure(); - auto op = builder.create(*currentOpLoc, outType, operand->front(), - extraArgs...); + auto op = opType::create(builder, *currentOpLoc, outType, operand->front(), + extraArgs...); LDBG() << "Built operation: " << op; return {{op.getResult()}}; } diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index ee4fa39158721..efbdbfb65d65b 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -2136,7 +2136,7 @@ struct TestTypeConversionDriver Location loc) -> Value { if (inputs.size() != 1 || !inputs[0].getType().isInteger(37)) return Value(); - return builder.create(loc, type, inputs) + return UnrealizedConversionCastOp::create(builder, loc, type, inputs) .getResult(0); }); From a321ce3d72ebe28f7a3bfb209f2f27d3ab057e77 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 22 Oct 2025 16:01:57 +0100 Subject: [PATCH 118/530] [SCEV] Expose getGEPExpr without needing to pass GEPOperator* (NFC) (#164487) Add a new getGEPExpr variant which is independent of GEPOperator*. To be used to construct SCEVs for VPlan recipes in https://github.com/llvm/llvm-project/pull/161276. 
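A rough usage sketch of the new overload (SE, Ctx, BasePtr and Idx are
hypothetical values used only for illustration):

  const SCEV *Base = SE.getSCEV(BasePtr);
  const SCEV *Offset = SE.getSCEV(Idx);
  // No GEPOperator* is needed; the source element type is passed directly.
  const SCEV *PtrExpr = SE.getGEPExpr(Base, {Offset}, Type::getInt32Ty(Ctx),
                                      GEPNoWrapFlags::none());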
PR: https://github.com/llvm/llvm-project/pull/164487 --- llvm/include/llvm/Analysis/ScalarEvolution.h | 8 ++++++-- llvm/lib/Analysis/ScalarEvolution.cpp | 17 +++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 3d3ec14796bc1..04ea769bd06d1 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -638,8 +638,12 @@ class ScalarEvolution { /// \p GEP The GEP. The indices contained in the GEP itself are ignored, /// instead we use IndexExprs. /// \p IndexExprs The expressions for the indices. - LLVM_ABI const SCEV * - getGEPExpr(GEPOperator *GEP, const SmallVectorImpl &IndexExprs); + LLVM_ABI const SCEV *getGEPExpr(GEPOperator *GEP, + ArrayRef IndexExprs); + LLVM_ABI const SCEV *getGEPExpr(const SCEV *BaseExpr, + ArrayRef IndexExprs, + Type *SrcElementTy, + GEPNoWrapFlags NW = GEPNoWrapFlags::none()); LLVM_ABI const SCEV *getAbsExpr(const SCEV *Op, bool IsNSW); LLVM_ABI const SCEV *getMinMaxExpr(SCEVTypes Kind, SmallVectorImpl &Operands); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 6f7dd79032cfe..7597f3ad685a0 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -3768,13 +3768,11 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl &Operands, return getOrCreateAddRecExpr(Operands, L, Flags); } -const SCEV * -ScalarEvolution::getGEPExpr(GEPOperator *GEP, - const SmallVectorImpl &IndexExprs) { +const SCEV *ScalarEvolution::getGEPExpr(GEPOperator *GEP, + ArrayRef IndexExprs) { const SCEV *BaseExpr = getSCEV(GEP->getPointerOperand()); // getSCEV(Base)->getType() has the same address space as Base->getType() // because SCEV::getType() preserves the address space. - Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType()); GEPNoWrapFlags NW = GEP->getNoWrapFlags(); if (NW != GEPNoWrapFlags::none()) { // We'd like to propagate flags from the IR to the corresponding SCEV nodes, @@ -3787,13 +3785,20 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP, NW = GEPNoWrapFlags::none(); } + return getGEPExpr(BaseExpr, IndexExprs, GEP->getSourceElementType(), NW); +} + +const SCEV *ScalarEvolution::getGEPExpr(const SCEV *BaseExpr, + ArrayRef IndexExprs, + Type *SrcElementTy, GEPNoWrapFlags NW) { SCEV::NoWrapFlags OffsetWrap = SCEV::FlagAnyWrap; if (NW.hasNoUnsignedSignedWrap()) OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNSW); if (NW.hasNoUnsignedWrap()) OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNUW); - Type *CurTy = GEP->getType(); + Type *CurTy = BaseExpr->getType(); + Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType()); bool FirstIter = true; SmallVector Offsets; for (const SCEV *IndexExpr : IndexExprs) { @@ -3812,7 +3817,7 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP, if (FirstIter) { assert(isa(CurTy) && "The first index of a GEP indexes a pointer"); - CurTy = GEP->getSourceElementType(); + CurTy = SrcElementTy; FirstIter = false; } else { CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0); From 003101e5f891dfec614a101b26dff22d92234391 Mon Sep 17 00:00:00 2001 From: Walter Lee <49250218+googlewalt@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:20:58 -0400 Subject: [PATCH 119/530] [bazel][libc] Add missing dependency (#164640) Fixes #164522. 
--- utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel index b44273123dcad..522a2bd62db82 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel @@ -110,6 +110,7 @@ libc_test_library( "//libc:__support_fputil_rounding_mode", "//libc:__support_libc_errno", "//libc:__support_macros_config", + "//libc:__support_macros_optimization", "//libc:__support_macros_properties_architectures", "//libc:hdr_fenv_macros", "//libc:hdr_math_macros", From f3599e55d12de7535cf5697a73fb0e1d74e037a2 Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Wed, 22 Oct 2025 08:26:21 -0700 Subject: [PATCH 120/530] [mlir][acc] Add OpenACCSupport for extensible dialect handling (#164510) The OpenACC dialect must coexist with source language dialects (FIR, CIR, etc.) to enable offloading. While type interfaces (`PointerLikeType` and `MappableType`) provide the primary contract for variable mapping, some scenarios require pipeline-specific customization or need to express information that cannot be adequately captured through operation and type interfaces alone. This commit introduces the `OpenACCSupport` analysis, which provides extensible support APIs that can be customized per-pipeline. The analysis follows the Concept-Model pattern used in MLIR's `AliasAnalysis` and is never invalidated, persisting throughout the pass pipeline. The initial API, `getVariableName(Value) -> string`, retrieves variable names from MLIR values by: - Checking for `acc.var_name` attributes - Extracting names from ACC data clause operations (e.g., `acc.copyin`) - Walking through `ViewLikeOpInterface` operations to find the source This will be used in the implicit data mapping pass to automatically generate device mappings with correct user-visible variable names. Usage: Passes call `getAnalysis()` to get a cached instance with either the default or a previously- registered custom implementation. Custom implementations can be registered in a setup pass by calling `setImplementation()` before the consumer pass runs. 
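Sketch of the intended registration/query flow (mirrors the example in the
new header below; the pass and implementation names are placeholders):

  void MySetupPass::runOnOperation() {
    acc::OpenACCSupport &support = getAnalysis<acc::OpenACCSupport>();
    support.setImplementation(MyCustomImpl());
  }

  void MyConsumerPass::runOnOperation() {
    acc::OpenACCSupport &support = getAnalysis<acc::OpenACCSupport>();
    std::string name = support.getVariableName(someValue);
  }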
--- .../Dialect/OpenACC/Analysis/OpenACCSupport.h | 135 ++++++++++++++++++ .../mlir/Dialect/OpenACC/OpenACCUtils.h | 5 + .../Dialect/OpenACC/Analysis/CMakeLists.txt | 13 ++ .../OpenACC/Analysis/OpenACCSupport.cpp | 26 ++++ mlir/lib/Dialect/OpenACC/CMakeLists.txt | 1 + .../Dialect/OpenACC/Utils/OpenACCUtils.cpp | 28 ++++ .../OpenACC/support-analysis-varname.mlir | 88 ++++++++++++ mlir/test/lib/Dialect/OpenACC/CMakeLists.txt | 2 + mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp | 2 + .../Dialect/OpenACC/TestOpenACCSupport.cpp | 73 ++++++++++ .../Dialect/OpenACC/OpenACCUtilsTest.cpp | 75 ++++++++++ 11 files changed, 448 insertions(+) create mode 100644 mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h create mode 100644 mlir/lib/Dialect/OpenACC/Analysis/CMakeLists.txt create mode 100644 mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp create mode 100644 mlir/test/Dialect/OpenACC/support-analysis-varname.mlir create mode 100644 mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp diff --git a/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h new file mode 100644 index 0000000000000..0833462ea0509 --- /dev/null +++ b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h @@ -0,0 +1,135 @@ +//===- OpenACCSupport.h - OpenACC Support Interface -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the OpenACCSupport analysis interface, which provides +// extensible support for OpenACC passes. Custom implementations +// can be registered to provide pipeline and dialect-specific information +// that cannot be adequately expressed through type or operation interfaces +// alone. +// +// Usage Pattern: +// ============== +// +// A pass that needs this functionality should call +// getAnalysis(), which will provide either: +// - A cached version if previously initialized, OR +// - A default implementation if not previously initialized +// +// This analysis is never invalidated (isInvalidated returns false), so it only +// needs to be initialized once and will persist throughout the pass pipeline. +// +// Registering a Custom Implementation: +// ===================================== +// +// If a custom implementation is needed, create a pass that runs BEFORE the pass +// that needs the analysis. In this setup pass, use +// getAnalysis() followed by setImplementation() to register +// your custom implementation. The custom implementation will need to provide +// implementation for all methods defined in the `OpenACCSupportTraits::Concept` +// class. +// +// Example: +// void MySetupPass::runOnOperation() { +// OpenACCSupport &support = getAnalysis(); +// support.setImplementation(MyCustomImpl()); +// } +// +// void MyAnalysisConsumerPass::runOnOperation() { +// OpenACCSupport &support = getAnalysis(); +// std::string name = support.getVariableName(someValue); +// // ... 
use the analysis results +// } +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_OPENACC_ANALYSIS_OPENACCSUPPORT_H +#define MLIR_DIALECT_OPENACC_ANALYSIS_OPENACCSUPPORT_H + +#include "mlir/IR/Value.h" +#include "mlir/Pass/AnalysisManager.h" +#include +#include + +namespace mlir { +namespace acc { + +namespace detail { +/// This class contains internal trait classes used by OpenACCSupport. +/// It follows the Concept-Model pattern used throughout MLIR (e.g., in +/// AliasAnalysis and interface definitions). +struct OpenACCSupportTraits { + class Concept { + public: + virtual ~Concept() = default; + + /// Get the variable name for a given MLIR value. + virtual std::string getVariableName(Value v) = 0; + }; + + /// This class wraps a concrete OpenACCSupport implementation and forwards + /// interface calls to it. This provides type erasure, allowing different + /// implementation types to be used interchangeably without inheritance. + template + class Model final : public Concept { + public: + explicit Model(ImplT &&impl) : impl(std::forward(impl)) {} + ~Model() override = default; + + std::string getVariableName(Value v) final { + return impl.getVariableName(v); + } + + private: + ImplT impl; + }; +}; +} // namespace detail + +//===----------------------------------------------------------------------===// +// OpenACCSupport +//===----------------------------------------------------------------------===// + +class OpenACCSupport { + using Concept = detail::OpenACCSupportTraits::Concept; + template + using Model = detail::OpenACCSupportTraits::Model; + +public: + OpenACCSupport() = default; + OpenACCSupport(Operation *op) {} + + /// Register a custom OpenACCSupport implementation. Only one implementation + /// can be registered at a time; calling this replaces any existing + /// implementation. + template + void setImplementation(AnalysisT &&analysis) { + impl = + std::make_unique>(std::forward(analysis)); + } + + /// Get the variable name for a given value. + /// + /// \param v The MLIR value to get the variable name for. + /// \return The variable name, or an empty string if unavailable. + std::string getVariableName(Value v); + + /// Signal that this analysis should always be preserved so that + /// underlying implementation registration is not lost. + bool isInvalidated(const AnalysisManager::PreservedAnalyses &pa) { + return false; + } + +private: + /// The registered custom implementation (if any). + std::unique_ptr impl; +}; + +} // namespace acc +} // namespace mlir + +#endif // MLIR_DIALECT_OPENACC_ANALYSIS_OPENACCSUPPORT_H diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h index 378f4348f2cf1..0ee88c6f47b67 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h @@ -38,6 +38,11 @@ std::optional getDefaultAttr(mlir::Operation *op); /// Get the type category of an OpenACC variable. mlir::acc::VariableTypeCategory getTypeCategory(mlir::Value var); +/// Attempts to extract the variable name from a value by walking through +/// view-like operations until an `acc.var_name` attribute is found. Returns +/// empty string if no name is found. 
+std::string getVariableName(mlir::Value v); + } // namespace acc } // namespace mlir diff --git a/mlir/lib/Dialect/OpenACC/Analysis/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Analysis/CMakeLists.txt new file mode 100644 index 0000000000000..f305068e1b3bc --- /dev/null +++ b/mlir/lib/Dialect/OpenACC/Analysis/CMakeLists.txt @@ -0,0 +1,13 @@ +add_mlir_dialect_library(MLIROpenACCAnalysis + OpenACCSupport.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenACC + + LINK_LIBS PUBLIC + MLIRIR + MLIROpenACCDialect + MLIROpenACCUtils + MLIRSupport +) + diff --git a/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp new file mode 100644 index 0000000000000..f6b4534794eaf --- /dev/null +++ b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp @@ -0,0 +1,26 @@ +//===- OpenACCSupport.cpp - OpenACCSupport Implementation -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenACCSupport analysis interface. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h" +#include "mlir/Dialect/OpenACC/OpenACCUtils.h" + +namespace mlir { +namespace acc { + +std::string OpenACCSupport::getVariableName(Value v) { + if (impl) + return impl->getVariableName(v); + return acc::getVariableName(v); +} + +} // namespace acc +} // namespace mlir diff --git a/mlir/lib/Dialect/OpenACC/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/CMakeLists.txt index 7117520599fa6..e8a916e824d71 100644 --- a/mlir/lib/Dialect/OpenACC/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenACC/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(Analysis) add_subdirectory(IR) add_subdirectory(Utils) add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp index 12233254f3fb4..89adda82646e6 100644 --- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp +++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp @@ -9,6 +9,7 @@ #include "mlir/Dialect/OpenACC/OpenACCUtils.h" #include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/ADT/TypeSwitch.h" mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region ®ion) { @@ -78,3 +79,30 @@ mlir::acc::VariableTypeCategory mlir::acc::getTypeCategory(mlir::Value var) { pointerLikeTy.getElementType()); return typeCategory; } + +std::string mlir::acc::getVariableName(mlir::Value v) { + Value current = v; + + // Walk through view operations until a name is found or can't go further + while (Operation *definingOp = current.getDefiningOp()) { + // Check for `acc.var_name` attribute + if (auto varNameAttr = + definingOp->getAttrOfType(getVarNameAttrName())) + return varNameAttr.getName().str(); + + // If it is a data entry operation, get name via getVarName + if (isa(definingOp)) + if (auto name = acc::getVarName(definingOp)) + return name->str(); + + // If it's a view operation, continue to the source + if (auto viewOp = dyn_cast(definingOp)) { + current = viewOp.getViewSource(); + continue; + } + + break; + } + + return ""; +} diff --git a/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir 
b/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir new file mode 100644 index 0000000000000..af52befb156e4 --- /dev/null +++ b/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir @@ -0,0 +1,88 @@ +// RUN: mlir-opt %s -split-input-file -test-acc-support | FileCheck %s + +// Test with direct variable names +func.func @test_direct_var_name() { + // Create a memref with acc.var_name attribute + %0 = memref.alloca() {acc.var_name = #acc.var_name<"my_variable">} : memref<10xi32> + + %1 = memref.cast %0 {test.var_name} : memref<10xi32> to memref<10xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32> + // CHECK-NEXT: getVariableName="my_variable" + + return +} + +// ----- + +// Test through memref.cast +func.func @test_through_cast() { + // Create a 5x2 memref with acc.var_name attribute + %0 = memref.alloca() {acc.var_name = #acc.var_name<"casted_variable">} : memref<5x2xi32> + + // Cast to dynamic dimensions + %1 = memref.cast %0 : memref<5x2xi32> to memref + + // Mark with test attribute - should find name through cast + %2 = memref.cast %1 {test.var_name} : memref to memref<5x2xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref to memref<5x2xi32> + // CHECK-NEXT: getVariableName="casted_variable" + + return +} + +// ----- + +// Test with no variable name +func.func @test_no_var_name() { + // Create a memref without acc.var_name attribute + %0 = memref.alloca() : memref<10xi32> + + // Mark with test attribute - should find empty string + %1 = memref.cast %0 {test.var_name} : memref<10xi32> to memref<10xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32> + // CHECK-NEXT: getVariableName="" + + return +} + +// ----- + +// Test through multiple casts +func.func @test_multiple_casts() { + // Create a memref with acc.var_name attribute + %0 = memref.alloca() {acc.var_name = #acc.var_name<"multi_cast">} : memref<10xi32> + + // Multiple casts + %1 = memref.cast %0 : memref<10xi32> to memref + %2 = memref.cast %1 : memref to memref<10xi32> + + // Mark with test attribute - should find name through multiple casts + %3 = memref.cast %2 {test.var_name} : memref<10xi32> to memref<10xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32> + // CHECK-NEXT: getVariableName="multi_cast" + + return +} + +// ----- + +// Test with acc.copyin operation +func.func @test_copyin_name() { + // Create a memref + %0 = memref.alloca() : memref<10xf32> + + // Create an acc.copyin operation with a name + %1 = acc.copyin varPtr(%0 : memref<10xf32>) -> memref<10xf32> {name = "input_data"} + + // Mark with test attribute - should find name from copyin operation + %2 = memref.cast %1 {test.var_name} : memref<10xf32> to memref + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xf32> to memref + // CHECK-NEXT: getVariableName="input_data" + + return +} diff --git a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt index 1e593389ec683..a54b642d4db42 100644 --- a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt +++ b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt @@ -2,6 +2,7 @@ add_mlir_library(MLIROpenACCTestPasses TestOpenACC.cpp TestPointerLikeTypeInterface.cpp TestRecipePopulate.cpp + TestOpenACCSupport.cpp EXCLUDE_FROM_LIBMLIR ) @@ -11,6 +12,7 @@ mlir_target_link_libraries(MLIROpenACCTestPasses PUBLIC MLIRFuncDialect MLIRMemRefDialect MLIROpenACCDialect + MLIROpenACCAnalysis MLIRPass 
MLIRSupport ) diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp index bea21b9827f7e..e59d77777ee40 100644 --- a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp +++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp @@ -16,11 +16,13 @@ namespace test { // Forward declarations of individual test pass registration functions void registerTestPointerLikeTypeInterfacePass(); void registerTestRecipePopulatePass(); +void registerTestOpenACCSupportPass(); // Unified registration function for all OpenACC tests void registerTestOpenACC() { registerTestPointerLikeTypeInterfacePass(); registerTestRecipePopulatePass(); + registerTestOpenACCSupportPass(); } } // namespace test diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp new file mode 100644 index 0000000000000..8bf984bdc2632 --- /dev/null +++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp @@ -0,0 +1,73 @@ +//===- TestOpenACCSupport.cpp - Test OpenACCSupport Analysis -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains test passes for testing the OpenACCSupport analysis. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::acc; + +namespace { + +struct TestOpenACCSupportPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOpenACCSupportPass) + + StringRef getArgument() const override { return "test-acc-support"; } + + StringRef getDescription() const override { + return "Test OpenACCSupport analysis"; + } + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + } +}; + +void TestOpenACCSupportPass::runOnOperation() { + auto func = getOperation(); + + // Get the OpenACCSupport analysis + OpenACCSupport &support = getAnalysis(); + + // Walk through operations looking for test attributes + func.walk([&](Operation *op) { + // Check for test.var_name attribute. This is the marker used to identify + // the operations that need to be tested for getVariableName. 
+ if (op->hasAttr("test.var_name")) { + // For each result of this operation, try to get the variable name + for (auto result : op->getResults()) { + std::string foundName = support.getVariableName(result); + llvm::outs() << "op=" << *op << "\n\tgetVariableName=\"" << foundName + << "\"\n"; + } + } + }); +} + +} // namespace + +namespace mlir { +namespace test { + +void registerTestOpenACCSupportPass() { + PassRegistration(); +} + +} // namespace test +} // namespace mlir diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp index ab817b640edb3..3fbbcc90a67c9 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp @@ -410,3 +410,78 @@ TEST_F(OpenACCUtilsTest, getTypeCategoryArray) { VariableTypeCategory category = getTypeCategory(varPtr); EXPECT_EQ(category, VariableTypeCategory::array); } + +//===----------------------------------------------------------------------===// +// getVariableName Tests +//===----------------------------------------------------------------------===// + +TEST_F(OpenACCUtilsTest, getVariableNameDirect) { + // Create a memref with acc.var_name attribute + auto memrefTy = MemRefType::get({10}, b.getI32Type()); + OwningOpRef allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + + // Set the acc.var_name attribute + auto varNameAttr = VarNameAttr::get(&context, "my_variable"); + allocOp.get()->setAttr(getVarNameAttrName(), varNameAttr); + + Value varPtr = allocOp->getResult(); + + // Test that getVariableName returns the variable name + std::string varName = getVariableName(varPtr); + EXPECT_EQ(varName, "my_variable"); +} + +TEST_F(OpenACCUtilsTest, getVariableNameThroughCast) { + // Create a 5x2 memref with acc.var_name attribute + auto memrefTy = MemRefType::get({5, 2}, b.getI32Type()); + OwningOpRef allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + + // Set the acc.var_name attribute on the alloca + auto varNameAttr = VarNameAttr::get(&context, "casted_variable"); + allocOp.get()->setAttr(getVarNameAttrName(), varNameAttr); + + Value allocResult = allocOp->getResult(); + + // Create a memref.cast operation to a flattened 10-element array + auto castedMemrefTy = MemRefType::get({10}, b.getI32Type()); + OwningOpRef castOp = + memref::CastOp::create(b, loc, castedMemrefTy, allocResult); + + Value castedPtr = castOp->getResult(); + + // Test that getVariableName walks through the cast to find the variable name + std::string varName = getVariableName(castedPtr); + EXPECT_EQ(varName, "casted_variable"); +} + +TEST_F(OpenACCUtilsTest, getVariableNameNotFound) { + // Create a memref without acc.var_name attribute + auto memrefTy = MemRefType::get({10}, b.getI32Type()); + OwningOpRef allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + + Value varPtr = allocOp->getResult(); + + // Test that getVariableName returns empty string when no name is found + std::string varName = getVariableName(varPtr); + EXPECT_EQ(varName, ""); +} + +TEST_F(OpenACCUtilsTest, getVariableNameFromCopyin) { + // Create a memref + auto memrefTy = MemRefType::get({10}, b.getI32Type()); + OwningOpRef allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + + Value varPtr = allocOp->getResult(); + StringRef name = "data_array"; + OwningOpRef copyinOp = + CopyinOp::create(b, loc, varPtr, /*structured=*/true, /*implicit=*/true, + /*name=*/name); + + // Test that getVariableName extracts the name from the copyin operation + std::string varName = 
getVariableName(copyinOp->getAccVar()); + EXPECT_EQ(varName, name); +} From 2936852d2f6b0e7af6237abe9796c494885b2aac Mon Sep 17 00:00:00 2001 From: Walter Lee <49250218+googlewalt@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:27:14 -0400 Subject: [PATCH 121/530] [ARM][AArch64] Move TestInputs to Inputs (#164633) It's standard in LLVM to have test inputs be in "Inputs". See https://llvm.org/docs/TestingGuide.html#extra-files. Fixes internal issue with #86212. --- llvm/test/LTO/AArch64/{TestInputs => Inputs}/bar.ll | 0 llvm/test/LTO/AArch64/{TestInputs => Inputs}/fiz.ll | 0 llvm/test/LTO/AArch64/{TestInputs => Inputs}/foo.ll | 0 llvm/test/LTO/AArch64/{TestInputs => Inputs}/old.ll | 0 llvm/test/LTO/AArch64/link-branch-target-enforcement.ll | 2 +- llvm/test/LTO/AArch64/link-sign-return-address.ll | 8 ++++---- 6 files changed, 5 insertions(+), 5 deletions(-) rename llvm/test/LTO/AArch64/{TestInputs => Inputs}/bar.ll (100%) rename llvm/test/LTO/AArch64/{TestInputs => Inputs}/fiz.ll (100%) rename llvm/test/LTO/AArch64/{TestInputs => Inputs}/foo.ll (100%) rename llvm/test/LTO/AArch64/{TestInputs => Inputs}/old.ll (100%) diff --git a/llvm/test/LTO/AArch64/TestInputs/bar.ll b/llvm/test/LTO/AArch64/Inputs/bar.ll similarity index 100% rename from llvm/test/LTO/AArch64/TestInputs/bar.ll rename to llvm/test/LTO/AArch64/Inputs/bar.ll diff --git a/llvm/test/LTO/AArch64/TestInputs/fiz.ll b/llvm/test/LTO/AArch64/Inputs/fiz.ll similarity index 100% rename from llvm/test/LTO/AArch64/TestInputs/fiz.ll rename to llvm/test/LTO/AArch64/Inputs/fiz.ll diff --git a/llvm/test/LTO/AArch64/TestInputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll similarity index 100% rename from llvm/test/LTO/AArch64/TestInputs/foo.ll rename to llvm/test/LTO/AArch64/Inputs/foo.ll diff --git a/llvm/test/LTO/AArch64/TestInputs/old.ll b/llvm/test/LTO/AArch64/Inputs/old.ll similarity index 100% rename from llvm/test/LTO/AArch64/TestInputs/old.ll rename to llvm/test/LTO/AArch64/Inputs/old.ll diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index aef8907c03411..20254de995f73 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -2,7 +2,7 @@ ;; be mixed. ;; ; RUN: llvm-as %s -o %t1.bc -; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc +; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc ; RUN: llvm-lto -exported-symbol main \ ; RUN: -exported-symbol foo_on \ ; RUN: -filetype=obj \ diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll index df6276f32b6f1..331e481b69e43 100644 --- a/llvm/test/LTO/AArch64/link-sign-return-address.ll +++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll @@ -2,10 +2,10 @@ ;; be mixed. 
; ; RUN: llvm-as %s -o %t1.bc -; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc -; RUN: llvm-as %p/TestInputs/fiz.ll -o %t3.bc -; RUN: llvm-as %p/TestInputs/bar.ll -o %t4.bc -; RUN: llvm-as %p/TestInputs/old.ll -o %t5.bc +; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc +; RUN: llvm-as %p/Inputs/fiz.ll -o %t3.bc +; RUN: llvm-as %p/Inputs/bar.ll -o %t4.bc +; RUN: llvm-as %p/Inputs/old.ll -o %t5.bc ; RUN: llvm-lto -exported-symbol main \ ; RUN: -exported-symbol foo_on \ ; RUN: -exported-symbol foo_off \ From ff37c1573cb8b4bcddc56e111544c244130af861 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Wed, 22 Oct 2025 10:28:24 -0500 Subject: [PATCH 122/530] Regen llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info-multi-entry.ll --- ...mplicit-kernargs-debug-info-multi-entry.ll | 67 +++++++++---------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info-multi-entry.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info-multi-entry.ll index 15d175031d9fd..47e5ccc12b975 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info-multi-entry.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info-multi-entry.ll @@ -69,33 +69,29 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg noundef ; GFX942-NEXT: s_mul_i32 s11, s5, s11 ; GFX942-NEXT: s_add_u32 s11, s14, s11 ; GFX942-NEXT: s_addc_u32 s14, 0, s15 -; GFX942-NEXT: s_add_u32 s15, s10, s11 -; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX942-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX942-NEXT: s_add_u32 s10, s10, s11 ; GFX942-NEXT: s_addc_u32 s5, s5, s14 -; GFX942-NEXT: s_mul_i32 s10, s1, s5 -; GFX942-NEXT: s_mul_hi_u32 s11, s1, s15 -; GFX942-NEXT: s_add_i32 s10, s11, s10 -; GFX942-NEXT: s_mul_i32 s3, s3, s15 -; GFX942-NEXT: s_add_i32 s10, s10, s3 -; GFX942-NEXT: s_mul_i32 s1, s1, s15 -; GFX942-NEXT: s_mul_hi_u32 s11, s5, s1 -; GFX942-NEXT: s_mul_i32 s14, s5, s1 -; GFX942-NEXT: s_mul_i32 s17, s15, s10 -; GFX942-NEXT: s_mul_hi_u32 s1, s15, s1 -; GFX942-NEXT: s_mul_hi_u32 s16, s15, s10 +; GFX942-NEXT: s_mul_i32 s11, s1, s5 +; GFX942-NEXT: s_mul_hi_u32 s14, s1, s10 +; GFX942-NEXT: s_add_i32 s11, s14, s11 +; GFX942-NEXT: s_mul_i32 s3, s3, s10 +; GFX942-NEXT: s_add_i32 s11, s11, s3 +; GFX942-NEXT: s_mul_i32 s1, s1, s10 +; GFX942-NEXT: s_mul_hi_u32 s14, s5, s1 +; GFX942-NEXT: s_mul_i32 s15, s5, s1 +; GFX942-NEXT: s_mul_i32 s17, s10, s11 +; GFX942-NEXT: s_mul_hi_u32 s1, s10, s1 +; GFX942-NEXT: s_mul_hi_u32 s16, s10, s11 ; GFX942-NEXT: s_add_u32 s1, s1, s17 ; GFX942-NEXT: s_addc_u32 s16, 0, s16 -; GFX942-NEXT: s_add_u32 s1, s1, s14 -; GFX942-NEXT: s_mul_hi_u32 s3, s5, s10 -; GFX942-NEXT: s_addc_u32 s1, s16, s11 +; GFX942-NEXT: s_add_u32 s1, s1, s15 +; GFX942-NEXT: s_mul_hi_u32 s3, s5, s11 +; GFX942-NEXT: s_addc_u32 s1, s16, s14 ; GFX942-NEXT: s_addc_u32 s3, s3, 0 -; GFX942-NEXT: s_mul_i32 s10, s5, s10 -; GFX942-NEXT: s_add_u32 s1, s1, s10 +; GFX942-NEXT: s_mul_i32 s11, s5, s11 +; GFX942-NEXT: s_add_u32 s1, s1, s11 ; GFX942-NEXT: s_addc_u32 s3, 0, s3 -; GFX942-NEXT: s_add_u32 s1, s15, s1 -; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX942-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX942-NEXT: s_add_u32 s1, s10, s1 ; GFX942-NEXT: s_addc_u32 s3, s5, s3 ; GFX942-NEXT: s_mul_i32 s10, s6, s3 ; GFX942-NEXT: s_mul_hi_u32 s11, s6, s1 @@ -118,37 +114,34 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg noundef ; GFX942-NEXT: s_add_i32 s5, s5, s10 ; GFX942-NEXT: s_sub_i32 s14, s7, s5 ; GFX942-NEXT: s_mul_i32 s10, 
s12, s1 -; GFX942-NEXT: s_sub_u32 s16, s6, s10 +; GFX942-NEXT: s_sub_u32 s15, s6, s10 ; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX942-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX942-NEXT: s_subb_u32 s17, s14, s13 -; GFX942-NEXT: s_sub_u32 s18, s16, s12 -; GFX942-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX942-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX942-NEXT: s_subb_u32 s14, s17, 0 +; GFX942-NEXT: s_subb_u32 s14, s14, s13 +; GFX942-NEXT: s_sub_u32 s16, s15, s12 +; GFX942-NEXT: s_subb_u32 s14, s14, 0 ; GFX942-NEXT: s_cmp_ge_u32 s14, s13 -; GFX942-NEXT: s_cselect_b32 s15, -1, 0 -; GFX942-NEXT: s_cmp_ge_u32 s18, s12 ; GFX942-NEXT: s_cselect_b32 s17, -1, 0 +; GFX942-NEXT: s_cmp_ge_u32 s16, s12 +; GFX942-NEXT: s_cselect_b32 s16, -1, 0 ; GFX942-NEXT: s_cmp_eq_u32 s14, s13 -; GFX942-NEXT: s_cselect_b32 s14, s17, s15 -; GFX942-NEXT: s_add_u32 s15, s1, 1 +; GFX942-NEXT: s_cselect_b32 s14, s16, s17 +; GFX942-NEXT: s_add_u32 s16, s1, 1 ; GFX942-NEXT: s_addc_u32 s17, s3, 0 ; GFX942-NEXT: s_add_u32 s18, s1, 2 ; GFX942-NEXT: s_addc_u32 s19, s3, 0 ; GFX942-NEXT: s_cmp_lg_u32 s14, 0 -; GFX942-NEXT: s_cselect_b32 s14, s18, s15 -; GFX942-NEXT: s_cselect_b32 s15, s19, s17 +; GFX942-NEXT: s_cselect_b32 s14, s18, s16 +; GFX942-NEXT: s_cselect_b32 s16, s19, s17 ; GFX942-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX942-NEXT: s_subb_u32 s5, s7, s5 ; GFX942-NEXT: s_cmp_ge_u32 s5, s13 ; GFX942-NEXT: s_cselect_b32 s10, -1, 0 -; GFX942-NEXT: s_cmp_ge_u32 s16, s12 +; GFX942-NEXT: s_cmp_ge_u32 s15, s12 ; GFX942-NEXT: s_cselect_b32 s11, -1, 0 ; GFX942-NEXT: s_cmp_eq_u32 s5, s13 ; GFX942-NEXT: s_cselect_b32 s5, s11, s10 ; GFX942-NEXT: s_cmp_lg_u32 s5, 0 -; GFX942-NEXT: s_cselect_b32 s11, s15, s3 +; GFX942-NEXT: s_cselect_b32 s11, s16, s3 ; GFX942-NEXT: s_cselect_b32 s10, s14, s1 ; GFX942-NEXT: s_cbranch_execnz .LBB0_3 ; GFX942-NEXT: .LBB0_2: From 16641ad8a29b6c877a3f934cd61d6acc9719e87e Mon Sep 17 00:00:00 2001 From: Nicole Aschenbrenner Date: Wed, 22 Oct 2025 17:35:16 +0200 Subject: [PATCH 123/530] [OpenMP] Adds omp_target_is_accessible routine (#138294) Adds omp_target_is_accessible routine. Refactors common code from omp_target_is_present to work for both routines. --------- Co-authored-by: Shilei Tian --- clang/docs/OpenMPSupport.rst | 2 +- offload/include/device.h | 3 ++ offload/include/omptarget.h | 1 + offload/libomptarget/OpenMP/API.cpp | 28 +++++++++++++ offload/libomptarget/device.cpp | 4 ++ offload/libomptarget/exports | 1 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 24 +++++++++++ .../common/include/PluginInterface.h | 11 +++++ .../common/src/PluginInterface.cpp | 20 ++++++++++ offload/test/mapping/is_accessible.cpp | 40 +++++++++++++++++++ 10 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 offload/test/mapping/is_accessible.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index c75c1703a8dc3..61b5babbd18a8 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -256,7 +256,7 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | device-specific environment variables | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | omp_target_is_accessible routine | :part:`In Progress` | https://github.com/llvm/llvm-project/pull/138294 | +| device | omp_target_is_accessible routine | :good:`done` | https://github.com/llvm/llvm-project/pull/138294 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | omp_get_mapped_ptr routine | :good:`done` | D141545 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/offload/include/device.h b/offload/include/device.h index bf93ce0460aef..4e27943d1dbc1 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -158,6 +158,9 @@ struct DeviceTy { /// Ask the device whether the runtime should use auto zero-copy. bool useAutoZeroCopy(); + /// Ask the device whether the storage is accessible. + bool isAccessiblePtr(const void *Ptr, size_t Size); + /// Check if there are pending images for this device. bool hasPendingImages() const { return HasPendingImages; } diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 794b79e07674e..89aa468689eaf 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -278,6 +278,7 @@ int omp_get_initial_device(void); void *omp_target_alloc(size_t Size, int DeviceNum); void omp_target_free(void *DevicePtr, int DeviceNum); int omp_target_is_present(const void *Ptr, int DeviceNum); +int omp_target_is_accessible(const void *Ptr, size_t Size, int DeviceNum); int omp_target_memcpy(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice); diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp index b0f0573833713..48b086d671285 100644 --- a/offload/libomptarget/OpenMP/API.cpp +++ b/offload/libomptarget/OpenMP/API.cpp @@ -196,6 +196,34 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { return Rc; } +/// Check whether a pointer is accessible from a device. +/// Returns true when accessibility is guaranteed otherwise returns false. 
+EXTERN int omp_target_is_accessible(const void *Ptr, size_t Size, + int DeviceNum) { + TIMESCOPE(); + OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); + DP("Call to omp_target_is_accessible for device %d, address " DPxMOD + ", size %zu\n", + DeviceNum, DPxPTR(Ptr), Size); + + if (!Ptr) { + DP("Call to omp_target_is_accessible with NULL ptr returning false\n"); + return false; + } + + if (DeviceNum == omp_get_initial_device() || DeviceNum == -1) { + DP("Call to omp_target_is_accessible on host, returning true\n"); + return true; + } + + // The device number must refer to a valid device + auto DeviceOrErr = PM->getDevice(DeviceNum); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); + + return DeviceOrErr->isAccessiblePtr(Ptr, Size); +} + EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice) { diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index 71423ae0c94d9..ee36fbed935a5 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -367,3 +367,7 @@ bool DeviceTy::useAutoZeroCopy() { return false; return RTL->use_auto_zero_copy(RTLDeviceID); } + +bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) { + return RTL->is_accessible_ptr(RTLDeviceID, Ptr, Size); +} diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports index 1374bfea81511..910a5b6c827a7 100644 --- a/offload/libomptarget/exports +++ b/offload/libomptarget/exports @@ -43,6 +43,7 @@ VERS1.0 { omp_get_initial_device; omp_target_alloc; omp_target_free; + omp_target_is_accessible; omp_target_is_present; omp_target_memcpy; omp_target_memcpy_rect; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 20d16fa5dc515..0b03ef534d273 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3062,6 +3062,30 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled); } + Expected isAccessiblePtrImpl(const void *Ptr, size_t Size) override { + hsa_amd_pointer_info_t Info; + Info.size = sizeof(hsa_amd_pointer_info_t); + + hsa_agent_t *Agents = nullptr; + uint32_t Count = 0; + hsa_status_t Status = + hsa_amd_pointer_info(Ptr, &Info, malloc, &Count, &Agents); + + if (auto Err = Plugin::check(Status, "error in hsa_amd_pointer_info: %s")) + return std::move(Err); + + // Checks if the pointer is known by HSA and accessible by the device + for (uint32_t i = 0; i < Count; i++) { + if (Agents[i].handle == getAgent().handle) + return Info.sizeInBytes >= Size; + } + + // If the pointer is unknown to HSA it's assumed a host pointer + // in that case the device can access it on unified memory support is + // enabled + return IsXnackEnabled; + } + /// Getters and setters for stack and heap sizes. 
Error getDeviceStackSize(uint64_t &Value) override { Value = StackSize; diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 8c530bba3882c..f9bff9abd903c 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -1066,6 +1066,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { bool useAutoZeroCopy(); virtual bool useAutoZeroCopyImpl() { return false; } + /// Returns true if the plugin can guarantee that the associated + /// storage is accessible + Expected isAccessiblePtr(const void *Ptr, size_t Size); + virtual Expected createInterop(int32_t InteropType, interop_spec_t &InteropSpec) { return nullptr; @@ -1166,6 +1170,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Per device setting of MemoryManager's Threshold virtual size_t getMemoryManagerSizeThreshold() { return 0; } + virtual Expected isAccessiblePtrImpl(const void *Ptr, size_t Size) { + return false; + } + /// Environment variables defined by the OpenMP standard. Int32Envar OMP_TeamLimit; Int32Envar OMP_NumTeams; @@ -1492,6 +1500,9 @@ struct GenericPluginTy { /// Returns if the plugin can support automatic copy. int32_t use_auto_zero_copy(int32_t DeviceId); + /// Returns if the associated storage is accessible for a given device. + int32_t is_accessible_ptr(int32_t DeviceId, const void *Ptr, size_t Size); + /// Look up a global symbol in the given binary. int32_t get_global(__tgt_device_binary Binary, uint64_t Size, const char *Name, void **DevicePtr); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index db43cbe49cc2b..36d643b65922d 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -1599,6 +1599,10 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) { bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); } +Expected GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) { + return isAccessiblePtrImpl(Ptr, Size); +} + Error GenericPluginTy::init() { if (Initialized) return Plugin::success(); @@ -2133,6 +2137,22 @@ int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) { return getDevice(DeviceId).useAutoZeroCopy(); } +int32_t GenericPluginTy::is_accessible_ptr(int32_t DeviceId, const void *Ptr, + size_t Size) { + auto HandleError = [&](Error Err) -> bool { + [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); + DP("Failure while checking accessibility of pointer %p for device %d: %s", + Ptr, DeviceId, ErrStr.c_str()); + return false; + }; + + auto AccessibleOrErr = getDevice(DeviceId).isAccessiblePtr(Ptr, Size); + if (Error Err = AccessibleOrErr.takeError()) + return HandleError(std::move(Err)); + + return *AccessibleOrErr; +} + int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size, const char *Name, void **DevicePtr) { assert(Binary.handle && "Invalid device binary handle"); diff --git a/offload/test/mapping/is_accessible.cpp b/offload/test/mapping/is_accessible.cpp new file mode 100644 index 0000000000000..7fb23893408ea --- /dev/null +++ b/offload/test/mapping/is_accessible.cpp @@ -0,0 +1,40 @@ +// RUN: %libomptarget-compilexx-generic +// RUN: env HSA_XNACK=1 %libomptarget-run-generic 2>&1 \ +// RUN: | %fcheck-generic + +// RUN: %libomptarget-compilexx-generic +// RUN: env HSA_XNACK=0 %libomptarget-run-generic 2>&1 \ +// RUN: | 
%fcheck-generic -check-prefix=NO_USM + +// REQUIRES: unified_shared_memory +// XFAIL: nvptx + +// CHECK: SUCCESS +// NO_USM: Not accessible + +#include +#include +#include +#include + +int main() { + int n = 10000; + int *a = new int[n]; + int err = 0; + + // program must be executed with HSA_XNACK=1 + if (!omp_target_is_accessible(a, n * sizeof(int), /*device_num=*/0)) + printf("Not accessible\n"); + else { +#pragma omp target teams distribute parallel for + for (int i = 0; i < n; i++) + a[i] = i; + + for (int i = 0; i < n; i++) + if (a[i] != i) + err++; + } + + printf("%s\n", err == 0 ? "SUCCESS" : "FAIL"); + return err; +} From 1128d3bddb2dbfba357febcc1bf44e7593644ce5 Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Wed, 22 Oct 2025 08:37:28 -0700 Subject: [PATCH 124/530] [lldb] add a warning if `dirname` is not in the PATH (#164494) This patch adds a check in lldb's API test CMakeLists file to ensure that `dirname` is in the PATH, which is not obvious on Windows. --------- Co-authored-by: nerix --- lldb/test/API/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lldb/test/API/CMakeLists.txt b/lldb/test/API/CMakeLists.txt index e3bffbc73c394..c719ac3382a67 100644 --- a/lldb/test/API/CMakeLists.txt +++ b/lldb/test/API/CMakeLists.txt @@ -74,6 +74,16 @@ else() endif() endif() +find_program(LLDB_DIRNAME_PATH dirname) +if(LLDB_DIRNAME_PATH) + message(STATUS "Found dirname: ${LLDB_DIRNAME_PATH}") +else() + message(STATUS "Could NOT find 'dirname'") + message(WARNING + "Many LLDB API tests require the GNU coreutils tools. Please make " + "sure they are installed and in PATH.") +endif() + if (TARGET clang) set(LLDB_DEFAULT_TEST_COMPILER "${LLVM_TOOLS_BINARY_DIR}/clang${CMAKE_EXECUTABLE_SUFFIX}") else() From 1e8834ea3ab770bf7befa3fae917f2c686d00e3b Mon Sep 17 00:00:00 2001 From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com> Date: Wed, 22 Oct 2025 08:41:41 -0700 Subject: [PATCH 125/530] [mlir][vector][xegpu] Accept uniform values in `getDistributedType` (#163887) Uniform values should not be distributed during vector distribution. Example would be a reduction result where reduction happens across lanes. However, current `getDistributedType` does not accept a zero result affine map (i.e. no distributed dims) when describing the distributed dimensions. This result in null type being returned and crashing the vector distribution in some cases. An example case would be a `scf.for` op (about to be distributed) in which one of the for result is a uniform value and it does not have a user outside the warp op. This necessitates querying the `getDistributedType` to figure our the distributed type of this value. --- .../Vector/Transforms/VectorDistribute.cpp | 7 ++- .../Transforms/XeGPUSubgroupDistribute.cpp | 15 ++++-- .../Dialect/XeGPU/subgroup-distribute.mlir | 51 +++++++++++++++++++ 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 7c019e7d25bf2..8b5e950733a22 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -341,13 +341,18 @@ struct WarpOpToScfIfPattern : public WarpDistributionPattern { /// Return the distributed vector type based on the original type and the /// distribution map. The map is expected to have a dimension equal to the /// original type rank and should be a projection where the results are the -/// distributed dimensions. 
The number of results should be equal to the number +/// distributed dimensions. If the number of results is zero there is no +/// distribution (i.e. original type is returned). +/// Otherwise, The number of results should be equal to the number /// of warp sizes which is currently limited to 1. /// Example: For a vector<16x32x64> distributed with a map(d0, d1, d2) -> (d1) /// and a warp size of 16 would distribute the second dimension (associated to /// d1) and return vector<16x2x64> static VectorType getDistributedType(VectorType originalType, AffineMap map, int64_t warpSize) { + // If the map has zero results, return the original type. + if (map.getNumResults() == 0) + return originalType; SmallVector targetShape(originalType.getShape()); for (unsigned i = 0, e = map.getNumResults(); i < e; i++) { unsigned position = map.getDimPosition(i); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index ba89daafedb30..d09dc196c0bf7 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1505,14 +1505,19 @@ void XeGPUSubgroupDistributePass::runOnOperation() { return AffineMap::get(val.getContext()); // Get the layout of the vector type. xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val); - // If no layout is specified, assume the inner most dimension is distributed - // for now. + // If no layout is specified, that means no distribution. if (!layout) - return AffineMap::getMultiDimMapWithTargets( - vecRank, {static_cast(vecRank - 1)}, val.getContext()); + return AffineMap::getMultiDimMapWithTargets(vecRank, {}, + val.getContext()); + // Expecting vector and layout rank to match. + assert(layout.getRank() == vecRank && + "Expecting vector and layout rank to match"); + // A dimension is distributed only if layout suggests there are + // multiple lanes assigned for this dimension and the shape can be evenly + // distributed to those lanes. 
SmallVector distributedDims; for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) { - if (v > 1) + if (v > 1 && vecType.getShape()[i] % v == 0) distributedDims.push_back(i); } return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims, diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 0e1365aa64171..27a3dc373c739 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -214,3 +214,54 @@ gpu.module @xevm_module{ } } + +// ----- +// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index, +// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout>, +// CHECK-SAME: memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) { +// CHECK: gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32> +// CHECK: } +// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args +// CHECK-SAME: (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) { +// CHECK: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32> +// CHECK: } +// CHECK: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32> +// CHECK: } +gpu.module @xevm_module{ + gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index, + %arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout>, + %arg2: memref<16x16xf32>) { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %ini = "some_def"() {layout_result_0 = #xegpu.layout} + : () -> (vector<16x1xf32>) + %ini2 = "some_def"() {layout_result_0 = #xegpu.layout} + : () -> (vector<16x16xf32>) + %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) { + %1 = "some_def"(%arg5) + { + layout_operand_0 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } + : (vector<16x1xf32>) -> (vector<16x1xf32>) + %acc = "some_def"(%arg4, %1) + { + layout_operand_0 = #xegpu.layout, + layout_operand_1 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } + : (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>) + scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32> + } + { + layout_result_0 = #xegpu.layout + } + xegpu.store_nd %3#0, %arg1[%c0, %c0] + : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + gpu.return + } +} From 6fca1381189065bf363257d5bc626f4f11e91a7f Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Wed, 22 Oct 2025 17:47:14 +0200 Subject: [PATCH 126/530] [BasicBlockUtils] Add BasicBlock printer (#163066) --- .../llvm/Transforms/Utils/BasicBlockUtils.h | 5 ++++ llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 10 +++++++ .../Transforms/Utils/BasicBlockUtilsTest.cpp | 29 +++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index 979f3b3eb72ff..e677cbf2d8968 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -21,6 +21,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Printable.h" #include 
namespace llvm { @@ -611,6 +612,10 @@ LLVM_ABI void InvertBranch(BranchInst *PBI, IRBuilderBase &Builder); // br/brcond/unreachable/ret LLVM_ABI bool hasOnlySimpleTerminator(const Function &F); +/// Print BasicBlock \p BB as an operand or print "" if \p BB is a +/// nullptr. +LLVM_ABI Printable printBasicBlock(const BasicBlock *BB); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_BASICBLOCKUTILS_H diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 8714741a08630..9829d4d50098c 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -1793,3 +1793,13 @@ bool llvm::hasOnlySimpleTerminator(const Function &F) { } return true; } + +Printable llvm::printBasicBlock(const BasicBlock *BB) { + return Printable([BB](raw_ostream &OS) { + if (!BB) { + OS << ""; + return; + } + BB->printAsOperand(OS); + }); +} diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp index 3c9374b526b09..4235c93f275f0 100644 --- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp +++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp @@ -716,3 +716,32 @@ attributes #0 = { presplitcoroutine } EXPECT_FALSE(llvm::isPresplitCoroSuspendExitEdge( *ExitN.getSinglePredecessor(), ExitN)); } + +TEST(BasicBlockUtils, BasicBlockPrintable) { + std::string S; + std::string SCheck; + llvm::raw_string_ostream OS{S}; + llvm::raw_string_ostream OSCheck{SCheck}; + + LLVMContext C; + std::unique_ptr M = parseIR(C, R"IR( +define void @foo() { + br label %bb0 +bb0: + br label %.exit +.exit: + ret void +} +)IR"); + + Function *F = M->getFunction("foo"); + for (const BasicBlock &BB : *F) { + OS << printBasicBlock(&BB); + BB.printAsOperand(OSCheck); + EXPECT_EQ(OS.str(), OSCheck.str()); + S.clear(); + SCheck.clear(); + } + OS << printBasicBlock(nullptr); + EXPECT_EQ(OS.str(), ""); +} From 57a8599d14d4685487064e8b8a5c748970daa4d6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 22 Oct 2025 08:58:32 -0700 Subject: [PATCH 127/530] [Docs] Clarify the brace policy for if/else/loop statements (#164570) Without this patch, the five-paragraph section is unclear about exactly when to use and not to use braces. Specifically, the first paragraph suggests omitting braces, and then subsequent paragraphs carve out exceptions. At the end, it's unclear what situations remain for omitting braces. This patch overhauls the text for readability. Specifically, it first describes when to omit braces and then lists cases where we should retain braces. --- llvm/docs/CodingStandards.rst | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst index 8677d894a94a6..63f6663d687ea 100644 --- a/llvm/docs/CodingStandards.rst +++ b/llvm/docs/CodingStandards.rst @@ -1692,29 +1692,29 @@ faraway places in the file to tell that the function is local: Don't Use Braces on Simple Single-Statement Bodies of if/else/loop Statements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When writing the body of an ``if``, ``else``, or for/while loop statement, we -prefer to omit the braces to avoid unnecessary line noise. However, braces -should be used in cases where the omission of braces harms the readability and -maintainability of the code. 
+When writing the body of an ``if``, ``else``, or ``for``/``while`` loop +statement, we aim to reduce unnecessary line noise. -We consider that readability is harmed when omitting the brace in the presence -of a single statement that is accompanied by a comment (assuming the comment -can't be hoisted above the ``if`` or loop statement, see below). +**Omit braces when:** -Similarly, braces should be used when a single-statement body is complex enough -that it becomes difficult to see where the block containing the following -statement began. An ``if``/``else`` chain or a loop is considered a single -statement for this rule, and this rule applies recursively. +* The body consists of a single **simple** statement. +* The single statement is not preceded by a comment. + (Hoist comments above the control statement if you can.) +* An ``else`` clause, if present, also meets the above criteria (single + simple statement, no associated comments). -This list is not exhaustive. For example, readability is also harmed if an -``if``/``else`` chain does not use braced bodies for either all or none of its -members, or has complex conditionals, deep nesting, etc. The examples below -intend to provide some guidelines. +**Use braces in all other cases, including:** -Maintainability is harmed if the body of an ``if`` ends with a (directly or -indirectly) nested ``if`` statement with no ``else``. Braces on the outer ``if`` -would help to avoid running into a "dangling else" situation. +* Multi-statement bodies +* Single-statement bodies with non-hoistable comments +* Complex single-statement bodies (e.g., deep nesting, complex nested + loops) +* Inconsistent bracing within ``if``/``else if``/``else`` chains (if one + block requires braces, all must) +* ``if`` statements ending with a nested ``if`` lacking an ``else`` (to + prevent "dangling else") +The examples below provide guidelines for these cases: .. 
code-block:: c++ From 72616c5ca99bcf4d93d423642387016cb88409bb Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Wed, 22 Oct 2025 18:11:56 +0200 Subject: [PATCH 128/530] [Clang] Avoid building unnecessary expressions when checking satisfaction (#164611) When establishing constraint satisfaction, we were building expressions even for compound constraint, This is unnecessary extra work that accounts for ~20% of the performance regression observed here https://github.com/llvm/llvm-project/pull/161671#issuecomment-3420976661 --- clang/lib/AST/ASTConcept.cpp | 2 +- clang/lib/Sema/SemaConcept.cpp | 32 ++++++++------ clang/test/SemaCXX/cxx2c-fold-exprs.cpp | 42 +++++++------------ .../SemaTemplate/concepts-recursive-inst.cpp | 1 - 4 files changed, 34 insertions(+), 43 deletions(-) diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp index 9ea104c4c3c9d..fd12bc4e83827 100644 --- a/clang/lib/AST/ASTConcept.cpp +++ b/clang/lib/AST/ASTConcept.cpp @@ -86,7 +86,7 @@ void ConstraintSatisfaction::Profile(llvm::FoldingSetNodeID &ID, ID.AddPointer(ConstraintOwner); ID.AddInteger(TemplateArgs.size()); for (auto &Arg : TemplateArgs) - C.getCanonicalTemplateArgument(Arg).Profile(ID, C); + Arg.Profile(ID, C); } ConceptReference * diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 54cbfe46a6528..a1163e904dc1b 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -417,8 +417,8 @@ class ConstraintSatisfactionChecker { const NamedDecl *Template; SourceLocation TemplateNameLoc; UnsignedOrNone PackSubstitutionIndex; - ConstraintSatisfaction &Satisfaction; + bool BuildExpression; private: ExprResult @@ -461,10 +461,11 @@ class ConstraintSatisfactionChecker { ConstraintSatisfactionChecker(Sema &SemaRef, const NamedDecl *Template, SourceLocation TemplateNameLoc, UnsignedOrNone PackSubstitutionIndex, - ConstraintSatisfaction &Satisfaction) + ConstraintSatisfaction &Satisfaction, + bool BuildExpression) : S(SemaRef), Template(Template), TemplateNameLoc(TemplateNameLoc), PackSubstitutionIndex(PackSubstitutionIndex), - Satisfaction(Satisfaction) {} + Satisfaction(Satisfaction), BuildExpression(BuildExpression) {} ExprResult Evaluate(const NormalizedConstraint &Constraint, const MultiLevelTemplateArgumentList &MLTAL); @@ -821,9 +822,10 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow( Satisfaction.ContainsErrors = false; ExprResult Expr = ConstraintSatisfactionChecker(S, Template, TemplateNameLoc, - UnsignedOrNone(I), Satisfaction) + UnsignedOrNone(I), Satisfaction, + /*BuildExpression=*/false) .Evaluate(Constraint.getNormalizedPattern(), *SubstitutedArgs); - if (Expr.isUsable()) { + if (BuildExpression && Expr.isUsable()) { if (Out.isUnset()) Out = Expr; else @@ -834,7 +836,7 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow( Constraint.getBeginLoc(), FPOptionsOverride{}); } else { - assert(!Satisfaction.IsSatisfied); + assert(!BuildExpression || !Satisfaction.IsSatisfied); } if (!Conjunction && Satisfaction.IsSatisfied) { Satisfaction.Details.erase(Satisfaction.Details.begin() + @@ -985,7 +987,7 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( ExprResult E = Evaluate(Constraint.getNormalizedConstraint(), MLTAL); - if (!E.isUsable()) { + if (E.isInvalid()) { Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, ConceptId); return E; } @@ -1041,7 +1043,7 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( if (Conjunction && (!Satisfaction.IsSatisfied || Satisfaction.ContainsErrors)) return LHS; - if 
(!Conjunction && LHS.isUsable() && Satisfaction.IsSatisfied && + if (!Conjunction && !LHS.isInvalid() && Satisfaction.IsSatisfied && !Satisfaction.ContainsErrors) return LHS; @@ -1050,12 +1052,15 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( ExprResult RHS = Evaluate(Constraint.getRHS(), MLTAL); - if (RHS.isUsable() && Satisfaction.IsSatisfied && + if (!Conjunction && !RHS.isInvalid() && Satisfaction.IsSatisfied && !Satisfaction.ContainsErrors) Satisfaction.Details.erase(Satisfaction.Details.begin() + EffectiveDetailEndIndex, Satisfaction.Details.end()); + if (!BuildExpression) + return Satisfaction.ContainsErrors ? ExprError() : ExprEmpty(); + if (!LHS.isUsable()) return RHS; @@ -1136,10 +1141,11 @@ static bool CheckConstraintSatisfaction( Template, /*CSE=*/nullptr, S.ArgPackSubstIndex); - ExprResult Res = - ConstraintSatisfactionChecker(S, Template, TemplateIDRange.getBegin(), - S.ArgPackSubstIndex, Satisfaction) - .Evaluate(*C, TemplateArgsLists); + ExprResult Res = ConstraintSatisfactionChecker( + S, Template, TemplateIDRange.getBegin(), + S.ArgPackSubstIndex, Satisfaction, + /*BuildExpression=*/ConvertedExpr != nullptr) + .Evaluate(*C, TemplateArgsLists); if (Res.isInvalid()) return true; diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp index 137f46ee3dc01..289059ea86eb9 100644 --- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -157,66 +157,55 @@ static_assert(And1() == 1); // FIXME: The diagnostics are not so great static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} - // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} - // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} - // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); static_assert(And2() == 2); static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} - // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); // expected-error {{no matching 
function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = S, U = ]}} \ - // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); static_assert(And3() == 3); static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} - // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = S, U = ]}} - // expected-note@#and3 {{because 'typename U::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(Or1<>() == 1); // expected-error {{no matching function for call to 'Or1'}} @@ -227,8 +216,7 @@ static_assert(Or1() == 1); static_assert(Or1() == 1); static_assert(Or1() == 1); // 
expected-error {{no matching function for call to 'Or1'}} // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#or1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#or1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(Or2() == 2); static_assert(Or2() == 2); @@ -236,16 +224,14 @@ static_assert(Or2() == 2); static_assert(Or2() == 2); static_assert(Or2() == 2); // expected-error {{no matching function for call to 'Or2'}} // expected-note@#or2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} - // expected-note@#or2 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#or2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); // expected-error {{no matching function for call to 'Or3'}} // expected-note@#or3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#or3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#or3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} } namespace bool_conversion_break { diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp index 73dce9317f383..5e1bce5f15684 100644 --- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp +++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp @@ -82,7 +82,6 @@ auto it = begin(rng); // #BEGIN_CALL // expected-error@#BEGIN_CALL {{no matching function for call to 'begin'}} // expected-note@#NOTINF_BEGIN {{candidate function}} // expected-note@#INF_BEGIN{{candidate template ignored: constraints not satisfied}} -// expected-note@#INF_BEGIN{{because 'Inf auto' does not satisfy 'Inf}} } } // namespace DirectRecursiveCheck From a4ff6d9cf8bcd0b1545644a137f63e00f5750123 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 22 Oct 2025 18:22:09 +0200 Subject: [PATCH 129/530] [libc++][C++03] Introduce LIBCPP_{,NON_}FROZEN_ASSERT (#163380) There are a few cases where test only fail with the C++03 frozen headers because some implementation details are asserted. This patch introduces new assertion macros to differentiate between frozen and non-frozen headers. 
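
A minimal usage sketch (not taken from the diff below; it assumes the libc++ test-support
header "test_macros.h" is on the include path, as it is for the tests touched here, and the
`bucket_count() == 5` check is purely illustrative):

```
#include <unordered_set>
#include "test_macros.h" // provides LIBCPP_ASSERT and, with this patch, LIBCPP_NON_FROZEN_ASSERT

void check_bucket_count(const std::unordered_set<int> &s) {
  // Checked only when the test runs against libc++ with the non-frozen headers;
  // under _LIBCPP_USE_FROZEN_CXX03_HEADERS the macro expands to a no-op static_assert.
  LIBCPP_NON_FROZEN_ASSERT(s.bucket_count() == 5);
}
```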
--- .../unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp | 4 +--- .../unord.multimap.cnstr/assign_copy.pass.cpp | 4 +--- .../unord.multiset.cnstr/assign_copy.pass.cpp | 4 +--- .../unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp | 4 +--- .../facet.num.put.members/put_pointer.pass.cpp | 4 +--- libcxx/test/support/test_macros.h | 6 ++++++ 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp index 3e4c5b1c03fcd..c9a0fa162b7d0 100644 --- a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp @@ -14,8 +14,6 @@ // unordered_map& operator=(const unordered_map& u); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include @@ -270,7 +268,7 @@ void test_alloc(const Alloc& lhs_alloc = Alloc(), V rhs_arr[] = {V(10, 4), V(13, 5), V(12, 324), V(0, 54), V(50, 5), V(2, 5)}; Map copy(begin(rhs_arr), end(rhs_arr), 0, std::hash(), std::equal_to(), rhs_alloc); copy = orig; - LIBCPP_ASSERT(copy.bucket_count() == 5); + LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5); assert(copy.size() == 4); assert(copy.at(1) == 1); assert(copy.at(2) == 3); diff --git a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp index 938b6beccd141..beb67d89915d3 100644 --- a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp @@ -14,8 +14,6 @@ // unordered_multimap& operator=(const unordered_multimap& u); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include @@ -289,7 +287,7 @@ void test_alloc(const Alloc& lhs_alloc = Alloc(), V rhs_arr[] = {V(10, 4), V(13, 5), V(12, 324), V(0, 54), V(50, 5), V(2, 5)}; Map copy(begin(rhs_arr), end(rhs_arr), 0, std::hash(), std::equal_to(), rhs_alloc); copy = orig; - LIBCPP_ASSERT(copy.bucket_count() == 5); + LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5); assert(copy.size() == 4); assert(copy.find(1)->second == 1); assert(copy.find(2)->second == 3); diff --git a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp index e415253c5f60f..5c85676fac2e3 100644 --- a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp @@ -14,8 +14,6 @@ // unordered_multiset& operator=(const unordered_multiset& u); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include @@ -259,7 +257,7 @@ void test_alloc(const Alloc& lhs_alloc = Alloc(), int rhs_arr[] = {10, 13, 12, 0, 50, 2}; Set copy(std::begin(rhs_arr), std::end(rhs_arr), 0, std::hash(), std::equal_to(), rhs_alloc); copy = orig; - LIBCPP_ASSERT(copy.bucket_count() == 5); + LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5); assert(copy.size() == 4); assert(copy.count(1) == 1); assert(copy.count(2) == 1); diff --git a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp index 9828b8b459c89..30e12e2dfa57c 100644 --- 
a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp @@ -14,8 +14,6 @@ // unordered_set& operator=(const unordered_set& u); -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include @@ -262,7 +260,7 @@ void test_alloc(const Alloc& lhs_alloc = Alloc(), int rhs_arr[] = {10, 13, 12, 0, 50, 2}; Set copy(std::begin(rhs_arr), std::end(rhs_arr), 0, std::hash(), std::equal_to(), rhs_alloc); copy = orig; - LIBCPP_ASSERT(copy.bucket_count() == 5); + LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5); assert(copy.size() == 4); assert(copy.count(1) == 1); assert(copy.count(2) == 1); diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp index 572a14e67e631..fed5b4a610fd4 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp @@ -12,8 +12,6 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, void* v) const; -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - #include #include #include @@ -36,7 +34,7 @@ int main(int, char**) { cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), ios, '*', v); std::string ex(str, base(iter)); assert(!ex.empty()); - LIBCPP_ASSERT(ex == "0"); + LIBCPP_NON_FROZEN_ASSERT(ex == "0"); } return 0; diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 2fc25fc024ac1..c4e1600572456 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -264,6 +264,12 @@ #define LIBCPP_ONLY(...) static_assert(true, "") #endif +#ifdef _LIBCPP_USE_FROZEN_CXX03_HEADERS +# define LIBCPP_NON_FROZEN_ASSERT(...) static_assert(true, "") +#else +# define LIBCPP_NON_FROZEN_ASSERT(...) 
LIBCPP_ASSERT(__VA_ARGS__) +#endif + #if __has_cpp_attribute(nodiscard) # define TEST_NODISCARD [[nodiscard]] #else From a516cc0badca0ebbe8db4ce415b95f4696c8d845 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 22 Oct 2025 18:22:43 +0200 Subject: [PATCH 130/530] [libc++][C++03] Fix alg.count/count.pass.cpp (#163751) --- libcxx/include/__cxx03/__algorithm/count.h | 10 +++++----- libcxx/include/__cxx03/__bit/popcount.h | 12 +++--------- .../alg.nonmodifying/alg.count/count.pass.cpp | 1 - 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/libcxx/include/__cxx03/__algorithm/count.h b/libcxx/include/__cxx03/__algorithm/count.h index 5440fd031a1d3..5b05b4b71f519 100644 --- a/libcxx/include/__cxx03/__algorithm/count.h +++ b/libcxx/include/__cxx03/__algorithm/count.h @@ -54,18 +54,18 @@ __count_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) if (__first.__ctz_ != 0) { __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); __storage_type __dn = std::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = std::__libcpp_popcount(std::__invert_if(*__first.__seg_) & __m); + __storage_type __m = (__storage_type(~0) << __first.__ctz_) & (__storage_type(~0) >> (__clz_f - __dn)); + __r = std::__libcpp_popcount<__storage_type>(std::__invert_if(*__first.__seg_) & __m); __n -= __dn; ++__first.__seg_; } // do middle whole words for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - __r += std::__libcpp_popcount(std::__invert_if(*__first.__seg_)); + __r += std::__libcpp_popcount<__storage_type>(std::__invert_if(*__first.__seg_)); // do last partial word if (__n > 0) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += std::__libcpp_popcount(std::__invert_if(*__first.__seg_) & __m); + __storage_type __m = __storage_type(~0) >> (__bits_per_word - __n); + __r += std::__libcpp_popcount<__storage_type>(std::__invert_if(*__first.__seg_) & __m); } return __r; } diff --git a/libcxx/include/__cxx03/__bit/popcount.h b/libcxx/include/__cxx03/__bit/popcount.h index 64404d2cf4948..a61a92177c5b7 100644 --- a/libcxx/include/__cxx03/__bit/popcount.h +++ b/libcxx/include/__cxx03/__bit/popcount.h @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO: __builtin_popcountg is available since Clang 19 and GCC 14. When support for older versions is dropped, we can -// refactor this code to exclusively use __builtin_popcountg. 
- #ifndef _LIBCPP___CXX03___BIT_POPCOUNT_H #define _LIBCPP___CXX03___BIT_POPCOUNT_H @@ -25,12 +22,9 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned __x) _NOEXCEPT { return __builtin_popcount(__x); } - -inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned long __x) _NOEXCEPT { return __builtin_popcountl(__x); } - -inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { - return __builtin_popcountll(__x); +template +_LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(_Tp __v) { + return __builtin_popcountg(__v); } _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp index e696dcdb34351..ffe3e0ef746c1 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp @@ -15,7 +15,6 @@ // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000 -// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include #include From 949148c1f33b96cf8893741e16129684e595b0ce Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 22 Oct 2025 17:35:22 +0100 Subject: [PATCH 131/530] [mlir][tosa] Fix argmax folder when output type is i64 (#163583) Previously the following IR: ``` tosa.argmax %arg0 {axis = 0 : i32} : (tensor<1xi8>) -> tensor ``` Would result in a crash with the assertion: ``` expected dense element bit width 64 to match data size 32 for type i64 ``` This commit ensures that zero is constructed with the correct bitwidth while folding, therefore fixing the crash. --- mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 8 ++++++-- mlir/test/Dialect/Tosa/canonicalize.mlir | 9 +++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index caf80165fc640..99b7cda49094e 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -1001,8 +1001,12 @@ OpFoldResult ArgMaxOp::fold(FoldAdaptor adaptor) { !outputTy.hasStaticShape()) return {}; - if (inputTy.getDimSize(getAxis()) == 1) - return DenseElementsAttr::get(outputTy, 0); + const Type outputElementTy = getElementTypeOrSelf(outputTy); + if (inputTy.getDimSize(getAxis()) == 1 && outputElementTy.isInteger()) { + const auto outputElemIntTy = cast(outputElementTy); + const APInt zero = APInt::getZero(outputElemIntTy.getWidth()); + return DenseElementsAttr::get(outputTy, zero); + } return {}; } diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index e8525a5d2ed62..7574afa215e78 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -9,6 +9,15 @@ func.func @argmax_nofold(%arg0: tensor) -> tensor<1xi32> { // ----- +// CHECK-LABEL: @test_argmax_fold_i64_index +func.func @test_argmax_fold_i64_index(%arg0: tensor<1xi8>) -> tensor { + // CHECK: "tosa.const"() <{values = dense<0> : tensor}> : () -> tensor + %0 = tosa.argmax %arg0 {axis = 0 : i32} : (tensor<1xi8>) -> tensor + return %0 : tensor +} + +// ----- + // CHECK-LABEL: @pad_wh_avg_pool2d_fold func.func @pad_wh_avg_pool2d_fold(%input: tensor<1x10x8x3xf32>) -> tensor<1x6x5x3xf32> { // CHECK-NOT: tosa.pad From 
f67880ae3d23981c733383126267dd8e841d1ea9 Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Wed, 22 Oct 2025 17:37:59 +0100 Subject: [PATCH 132/530] [lldb-dap] Fix mutex acquisition order for modules event. (#163821) The modules event requires the `APIMutex` to create the module load address. this may happen at the same time the `module request` is handled. The modules request also requires the `APIMutex` and the `modules_mutex, set the order to acquire the mutexes. --- lldb/tools/lldb-dap/DAP.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 52c8c6b9092b9..3c4f2253d1ad5 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -26,6 +26,7 @@ #include "lldb/API/SBEvent.h" #include "lldb/API/SBLanguageRuntime.h" #include "lldb/API/SBListener.h" +#include "lldb/API/SBMutex.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBStream.h" #include "lldb/Host/JSONTransport.h" @@ -1452,7 +1453,11 @@ void DAP::EventThread() { const bool remove_module = event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded; - std::lock_guard guard(modules_mutex); + // NOTE: Both mutexes must be acquired to prevent deadlock when + // handling `modules_request`, which also requires both locks. + lldb::SBMutex api_mutex = GetAPIMutex(); + const std::scoped_lock guard( + api_mutex, modules_mutex); for (uint32_t i = 0; i < num_modules; ++i) { lldb::SBModule module = lldb::SBTarget::GetModuleAtIndexFromEvent(i, event); From d5e9a56ccb44eeb1f36a7af8be81081bbf00eb9c Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Wed, 22 Oct 2025 18:39:46 +0200 Subject: [PATCH 133/530] [CIR][NFC] Update redirection syntax in lit test (#164572) Update the redirection syntax in the lit test file to be compatible with the LLVM internal shell, which is the default now --- clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c index 92eae6aab6800..19362cf79b107 100644 --- a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c +++ b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-cir -fdump-record-layouts %s -o %t.cir 1> %t.cirlayout +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-cir -fdump-record-layouts %s -o %t.cir > %t.cirlayout // RUN: FileCheck --input-file=%t.cirlayout %s --check-prefix=CIR-LAYOUT // RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR // RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll // RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM -// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -fdump-record-layouts %s -o %t.ll 1> %t.ogcglayout +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -fdump-record-layouts %s -o %t.ll > %t.ogcglayout // RUN: FileCheck --input-file=%t.ogcglayout %s --check-prefix=OGCG-LAYOUT // RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG From 57a8228f40b6d8593a8744f6285444f124c636b8 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 22 Oct 2025 09:41:03 -0700 Subject: [PATCH 134/530] [libc] Add macros definitions for (#164474) This PR adds required macro definitions to `` header, so that they're available to the code including it. 
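
For reference, a minimal sketch of how a POSIX caller would use the two macros (the
catalog path "example.cat" is a placeholder, not something defined by this patch):

```
#include <nl_types.h>
#include <cstdio>

int main() {
  // NL_CAT_LOCALE asks catopen to honor LC_MESSAGES when locating the catalog.
  nl_catd cat = catopen("example.cat", NL_CAT_LOCALE);
  if (cat == reinterpret_cast<nl_catd>(-1))
    return 1; // no catalog available
  // NL_SETD is the default message set; "hello\n" is the fallback text
  // returned when the lookup fails.
  std::fputs(catgets(cat, NL_SETD, 1, "hello\n"), stdout);
  catclose(cat);
  return 0;
}
```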
The header itself was added in 12abe8aed6dc724546cf93c94d7ff6abe864f28b, together with stub definitions of the three required functions. --- libc/include/CMakeLists.txt | 1 + libc/include/llvm-libc-macros/CMakeLists.txt | 6 ++++++ libc/include/llvm-libc-macros/nl-types-macros.h | 15 +++++++++++++++ libc/include/nl_types.yaml | 6 +++++- libc/test/src/nl_types/CMakeLists.txt | 1 + libc/test/src/nl_types/nl_types_test.cpp | 5 +++-- 6 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 libc/include/llvm-libc-macros/nl-types-macros.h diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 09f169b9a985f..a277690ae2e17 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -776,6 +776,7 @@ add_header_macro( ../libc/include/nl_types.yaml nl_types.h DEPENDS + .llvm-libc-macros.nl_types_macros .llvm-libc-types.nl_catd ) diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index 76c03d913ee12..a33ff6cabbcf5 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -345,6 +345,12 @@ add_macro_header( locale-macros.h ) +add_macro_header( + nl_types_macros + HDR + nl-types-macros.h +) + add_macro_header( pthread_macros HDR diff --git a/libc/include/llvm-libc-macros/nl-types-macros.h b/libc/include/llvm-libc-macros/nl-types-macros.h new file mode 100644 index 0000000000000..b6d0d3519b510 --- /dev/null +++ b/libc/include/llvm-libc-macros/nl-types-macros.h @@ -0,0 +1,15 @@ +//===-- Definition of macros from nl_types.h ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_MACROS_NL_TYPES_MACROS_H +#define LLVM_LIBC_MACROS_NL_TYPES_MACROS_H + +#define NL_SETD 1 +#define NL_CAT_LOCALE 1 + +#endif // LLVM_LIBC_MACROS_NL_TYPES_MACROS_H diff --git a/libc/include/nl_types.yaml b/libc/include/nl_types.yaml index aecbb44a02224..bdb59a82a3337 100644 --- a/libc/include/nl_types.yaml +++ b/libc/include/nl_types.yaml @@ -1,7 +1,11 @@ header: nl_types.h standards: - posix -macros: [] +macros: + - macro_name: NL_SETD + macro_header: nl-types-macros.h + - macro_name: NL_CAT_LOCALE + macro_header: nl-types-macros.h types: - type_name: nl_catd enums: [] diff --git a/libc/test/src/nl_types/CMakeLists.txt b/libc/test/src/nl_types/CMakeLists.txt index 4fce637baa726..6bafb32f15c0b 100644 --- a/libc/test/src/nl_types/CMakeLists.txt +++ b/libc/test/src/nl_types/CMakeLists.txt @@ -7,6 +7,7 @@ add_libc_test( SRCS nl_types_test.cpp DEPENDS + libc.include.llvm-libc-macros.nl_types_macros libc.include.llvm-libc-types.nl_catd libc.src.nl_types.catopen libc.src.nl_types.catclose diff --git a/libc/test/src/nl_types/nl_types_test.cpp b/libc/test/src/nl_types/nl_types_test.cpp index 5ae5c5ab28546..7392200272dfd 100644 --- a/libc/test/src/nl_types/nl_types_test.cpp +++ b/libc/test/src/nl_types/nl_types_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "include/llvm-libc-macros/nl-types-macros.h" #include "include/llvm-libc-types/nl_catd.h" #include "src/nl_types/catclose.h" #include "src/nl_types/catgets.h" @@ -15,7 +16,7 @@ using LlvmLibcNlTypesTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; 
TEST_F(LlvmLibcNlTypesTest, CatopenFails) { - ASSERT_EQ(LIBC_NAMESPACE::catopen("/somepath", 0), + ASSERT_EQ(LIBC_NAMESPACE::catopen("/somepath", NL_CAT_LOCALE), reinterpret_cast(-1)); ASSERT_ERRNO_EQ(EINVAL); } @@ -28,6 +29,6 @@ TEST_F(LlvmLibcNlTypesTest, CatgetsFails) { const char *message = "message"; // Note that we test for pointer equality here, since catgets // is expected to return the input argument as-is. - ASSERT_EQ(LIBC_NAMESPACE::catgets(nullptr, 0, 0, message), + ASSERT_EQ(LIBC_NAMESPACE::catgets(nullptr, NL_SETD, 1, message), const_cast(message)); } From e6af0a40acc6d244b6862142829274e0bde7d75c Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 22 Oct 2025 11:46:11 -0500 Subject: [PATCH 135/530] [flang][OpenMP] Keep track of scoping units in OmpStructureChecker (#164419) Introduce a stack of scopes to OmpStructureChecker for scoping units, plus function/subroutine entries in interfaces. This will help with applying and locating properties introduced by declarative or informational directives (e.g. DECLARE_TARGET, REQUIRES), which are stored as flags on the corresponding symbols. --- flang/lib/Semantics/check-omp-structure.cpp | 118 ++++++++++++++++++++ flang/lib/Semantics/check-omp-structure.h | 35 ++++-- 2 files changed, 142 insertions(+), 11 deletions(-) diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index be10669ac2536..b3c4d24ad496d 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -61,6 +61,124 @@ namespace Fortran::semantics { using namespace Fortran::semantics::omp; using namespace Fortran::parser::omp; +OmpStructureChecker::OmpStructureChecker(SemanticsContext &context) + : DirectiveStructureChecker(context, +#define GEN_FLANG_DIRECTIVE_CLAUSE_MAP +#include "llvm/Frontend/OpenMP/OMP.inc" + ) { + scopeStack_.push_back(&context.globalScope()); +} + +bool OmpStructureChecker::Enter(const parser::MainProgram &x) { + using StatementProgramStmt = parser::Statement; + if (auto &stmt{std::get>(x.t)}) { + scopeStack_.push_back(stmt->statement.v.symbol->scope()); + } else { + for (const Scope &scope : context_.globalScope().children()) { + // There can only be one main program. + if (scope.kind() == Scope::Kind::MainProgram) { + scopeStack_.push_back(&scope); + break; + } + } + } + return true; +} + +void OmpStructureChecker::Leave(const parser::MainProgram &x) { + scopeStack_.pop_back(); +} + +bool OmpStructureChecker::Enter(const parser::BlockData &x) { + // The BLOCK DATA name is optional, so we need to look for the + // corresponding scope in the global scope. 
+ auto &stmt{std::get>(x.t)}; + if (auto &name{stmt.statement.v}) { + scopeStack_.push_back(name->symbol->scope()); + } else { + for (const Scope &scope : context_.globalScope().children()) { + if (scope.kind() == Scope::Kind::BlockData) { + if (scope.symbol()->name().empty()) { + scopeStack_.push_back(&scope); + break; + } + } + } + } + return true; +} + +void OmpStructureChecker::Leave(const parser::BlockData &x) { + scopeStack_.pop_back(); +} + +bool OmpStructureChecker::Enter(const parser::Module &x) { + auto &stmt{std::get>(x.t)}; + const Symbol *sym{stmt.statement.v.symbol}; + scopeStack_.push_back(sym->scope()); + return true; +} + +void OmpStructureChecker::Leave(const parser::Module &x) { + scopeStack_.pop_back(); +} + +bool OmpStructureChecker::Enter(const parser::Submodule &x) { + auto &stmt{std::get>(x.t)}; + const Symbol *sym{std::get(stmt.statement.t).symbol}; + scopeStack_.push_back(sym->scope()); + return true; +} + +void OmpStructureChecker::Leave(const parser::Submodule &x) { + scopeStack_.pop_back(); +} + +// Function/subroutine subprogram nodes don't appear in INTERFACEs, but +// the subprogram/end statements do. +bool OmpStructureChecker::Enter(const parser::SubroutineStmt &x) { + const Symbol *sym{std::get(x.t).symbol}; + scopeStack_.push_back(sym->scope()); + return true; +} + +bool OmpStructureChecker::Enter(const parser::EndSubroutineStmt &x) { + scopeStack_.pop_back(); + return true; +} + +bool OmpStructureChecker::Enter(const parser::FunctionStmt &x) { + const Symbol *sym{std::get(x.t).symbol}; + scopeStack_.push_back(sym->scope()); + return true; +} + +bool OmpStructureChecker::Enter(const parser::EndFunctionStmt &x) { + scopeStack_.pop_back(); + return true; +} + +bool OmpStructureChecker::Enter(const parser::BlockConstruct &x) { + auto &specPart{std::get(x.t)}; + auto &execPart{std::get(x.t)}; + if (auto &&source{parser::GetSource(specPart)}) { + scopeStack_.push_back(&context_.FindScope(*source)); + } else if (auto &&source{parser::GetSource(execPart)}) { + scopeStack_.push_back(&context_.FindScope(*source)); + } + return true; +} + +void OmpStructureChecker::Leave(const parser::BlockConstruct &x) { + auto &specPart{std::get(x.t)}; + auto &execPart{std::get(x.t)}; + if (auto &&source{parser::GetSource(specPart)}) { + scopeStack_.push_back(&context_.FindScope(*source)); + } else if (auto &&source{parser::GetSource(execPart)}) { + scopeStack_.push_back(&context_.FindScope(*source)); + } +} + // Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. 
#define CHECK_SIMPLE_CLAUSE(X, Y) \ void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index b3fd6c8b4b7c0..3b184f1214904 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -56,21 +56,32 @@ using SymbolSourceMap = std::multimap; using DirectivesClauseTriple = std::multimap>; -class OmpStructureChecker - : public DirectiveStructureChecker { +using OmpStructureCheckerBase = DirectiveStructureChecker; + +class OmpStructureChecker : public OmpStructureCheckerBase { public: - using Base = DirectiveStructureChecker; + using Base = OmpStructureCheckerBase; + + OmpStructureChecker(SemanticsContext &context); - OmpStructureChecker(SemanticsContext &context) - : DirectiveStructureChecker(context, -#define GEN_FLANG_DIRECTIVE_CLAUSE_MAP -#include "llvm/Frontend/OpenMP/OMP.inc" - ) { - } using llvmOmpClause = const llvm::omp::Clause; + bool Enter(const parser::MainProgram &); + void Leave(const parser::MainProgram &); + bool Enter(const parser::BlockData &); + void Leave(const parser::BlockData &); + bool Enter(const parser::Module &); + void Leave(const parser::Module &); + bool Enter(const parser::Submodule &); + void Leave(const parser::Submodule &); + bool Enter(const parser::SubroutineStmt &); + bool Enter(const parser::EndSubroutineStmt &); + bool Enter(const parser::FunctionStmt &); + bool Enter(const parser::EndFunctionStmt &); + bool Enter(const parser::BlockConstruct &); + void Leave(const parser::BlockConstruct &); + void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); void Enter(const parser::OpenMPInteropConstruct &); @@ -373,6 +384,8 @@ class OmpStructureChecker using LoopConstruct = std::variant; std::vector loopStack_; + // Scopes for scoping units. + std::vector scopeStack_; }; /// Find a duplicate entry in the range, and return an iterator to it. From 322dd630043855f81043f02ff4ab43899587ea47 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 22 Oct 2025 11:46:48 -0500 Subject: [PATCH 136/530] [flang][OpenMP] Refactor/update semantic checks for ALLOCATE directive (#164420) OpenMP 5.0 and 5.1 allowed the ALLOCATE directive to appear in two forms, declarative and executable. The syntax of an individual directive was the same in both cases, but the semantic restrictions were slightly different. - Update the semantic checks to reflect the different restrictions, gather them in a single function. - Improve test for the presence of a TARGET region, add a check for REQUIRES directive. - Update tests. 
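A minimal Fortran sketch of what the reworked checks distinguish (illustrative
only, not part of this patch; the variable names are made up, and the
predefined omp_default_mem_alloc allocator comes from omp_lib as in the tests
updated below):

```fortran
subroutine allocate_forms
  use omp_lib
  integer, save :: x
  real, allocatable :: darray(:)

  ! Declarative form: not associated with an ALLOCATE statement. Because x
  ! has the SAVE attribute, a predefined allocator must be supplied.
  !$omp allocate(x) allocator(omp_default_mem_alloc)

  !$omp target
    ! Executable form, associated with the ALLOCATE statement that follows.
    ! Inside a TARGET region an ALLOCATOR clause is required unless
    ! REQUIRES(DYNAMIC_ALLOCATORS) is in effect.
    !$omp allocate(darray) allocator(omp_default_mem_alloc)
    allocate(darray(100))
    deallocate(darray)
  !$omp end target
end subroutine allocate_forms
```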
--- flang/include/flang/Semantics/openmp-utils.h | 8 +- flang/lib/Semantics/check-omp-structure.cpp | 184 +++++++++++++----- flang/lib/Semantics/check-omp-structure.h | 6 + flang/lib/Semantics/openmp-utils.cpp | 2 +- flang/lib/Semantics/resolve-directives.cpp | 20 -- flang/test/Semantics/OpenMP/allocate01.f90 | 2 +- flang/test/Semantics/OpenMP/allocate04.f90 | 17 +- flang/test/Semantics/OpenMP/allocate05.f90 | 2 +- flang/test/Semantics/OpenMP/allocate06.f90 | 2 +- flang/test/Semantics/OpenMP/allocate08.f90 | 26 ++- flang/test/Semantics/OpenMP/allocators04.f90 | 2 - flang/test/Semantics/OpenMP/allocators06.f90 | 18 -- .../OpenMP/declarative-directive02.f90 | 6 +- 13 files changed, 175 insertions(+), 120 deletions(-) delete mode 100644 flang/test/Semantics/OpenMP/allocators06.f90 diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h index 0f851830edd46..7539d12264435 100644 --- a/flang/include/flang/Semantics/openmp-utils.h +++ b/flang/include/flang/Semantics/openmp-utils.h @@ -13,9 +13,11 @@ #ifndef FORTRAN_SEMANTICS_OPENMP_UTILS_H #define FORTRAN_SEMANTICS_OPENMP_UTILS_H +#include "flang/Common/indirection.h" #include "flang/Evaluate/type.h" #include "flang/Parser/char-block.h" #include "flang/Parser/parse-tree.h" +#include "flang/Parser/tools.h" #include "flang/Semantics/tools.h" #include "llvm/ADT/ArrayRef.h" @@ -74,7 +76,11 @@ bool IsVarOrFunctionRef(const MaybeExpr &expr); bool IsMapEnteringType(parser::OmpMapType::Value type); bool IsMapExitingType(parser::OmpMapType::Value type); -std::optional GetEvaluateExpr(const parser::Expr &parserExpr); +MaybeExpr GetEvaluateExpr(const parser::Expr &parserExpr); +template MaybeExpr GetEvaluateExpr(const T &inp) { + return GetEvaluateExpr(parser::UnwrapRef(inp)); +} + std::optional GetDynamicType( const parser::Expr &parserExpr); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index b3c4d24ad496d..41416304c1ea6 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -480,6 +480,36 @@ bool OmpStructureChecker::IsNestedInDirective(llvm::omp::Directive directive) { return false; } +bool OmpStructureChecker::InTargetRegion() { + if (IsNestedInDirective(llvm::omp::Directive::OMPD_target)) { + // Return true even for device_type(host). 
+ return true; + } + for (const Scope *scope : llvm::reverse(scopeStack_)) { + if (const auto *symbol{scope->symbol()}) { + if (symbol->test(Symbol::Flag::OmpDeclareTarget)) { + return true; + } + } + } + return false; +} + +bool OmpStructureChecker::HasRequires(llvm::omp::Clause req) { + const Scope &unit{GetProgramUnit(*scopeStack_.back())}; + return common::visit( + [&](const auto &details) { + if constexpr (std::is_convertible_v) { + if (auto *reqs{details.ompRequires()}) { + return reqs->test(req); + } + } + return false; + }, + DEREF(unit.symbol()).details()); +} + void OmpStructureChecker::CheckVariableListItem( const SymbolSourceMap &symbols) { for (auto &[symbol, source] : symbols) { @@ -1680,40 +1710,95 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { dirContext_.pop_back(); } -void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) { - isPredefinedAllocator = true; - const auto &dir{std::get(x.t)}; - const auto &objectList{std::get(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); - SymbolSourceMap currSymbols; - GetSymbolsInObjectList(objectList, currSymbols); - for (auto &[symbol, source] : currSymbols) { - if (IsPointer(*symbol)) { - context_.Say(source, - "List item '%s' in ALLOCATE directive must not have POINTER " - "attribute"_err_en_US, - source.ToString()); +void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, + const parser::OmpObjectList &objects, + const parser::OmpClauseList &clauses) { + const Scope &thisScope{context_.FindScope(source)}; + SymbolSourceMap symbols; + GetSymbolsInObjectList(objects, symbols); + + auto maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) { + // Return "true" if the ALLOCATOR clause was provided with an argument + // that is either a prefdefined allocator, or a run-time value. + // Otherwise return "false". 
+ if (!calloc) { + return false; + } + auto *allocator{std::get_if(&calloc->u)}; + if (auto val{ToInt64(GetEvaluateExpr(DEREF(allocator).v))}) { + // Predefined allocators (defined in OpenMP 6.0 20.8.1): + // omp_null_allocator = 0, + // omp_default_mem_alloc = 1, + // omp_large_cap_mem_alloc = 2, + // omp_const_mem_alloc = 3, + // omp_high_bw_mem_alloc = 4, + // omp_low_lat_mem_alloc = 5, + // omp_cgroup_mem_alloc = 6, + // omp_pteam_mem_alloc = 7, + // omp_thread_mem_alloc = 8 + return *val >= 0 && *val <= 8; } - if (IsDummy(*symbol)) { + return true; + }}; + + const auto *allocator{FindClause(llvm::omp::Clause::OMPC_allocator)}; + if (InTargetRegion()) { + bool hasDynAllocators{ + HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; + if (!allocator && !hasDynAllocators) { context_.Say(source, - "List item '%s' in ALLOCATE directive must not be a dummy " - "argument"_err_en_US, - source.ToString()); + "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); + } + } + + auto maybePredefined{maybeHasPredefinedAllocator(allocator)}; + + for (auto &[symbol, source] : symbols) { + if (!inExecutableAllocate_) { + if (symbol->owner() != thisScope) { + context_.Say(source, + "A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears"_err_en_US); + } + if (IsPointer(*symbol) || IsAllocatable(*symbol)) { + context_.Say(source, + "A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute"_err_en_US); + } } if (symbol->GetUltimate().has()) { context_.Say(source, - "List item '%s' in ALLOCATE directive must not be an associate " - "name"_err_en_US, - source.ToString()); + "A list item in a declarative ALLOCATE cannot be an associate name"_err_en_US); + } + if (symbol->attrs().test(Attr::SAVE) || IsCommonBlock(*symbol)) { + if (!allocator) { + context_.Say(source, + "If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator"_err_en_US); + } else if (!maybePredefined) { + context_.Say(source, + "If a list item is a named common block or has SAVE attribute, only a predefined allocator may be used on the ALLOCATOR clause"_err_en_US); + } + } + if (FindCommonBlockContaining(*symbol)) { + context_.Say(source, + "A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block"_err_en_US); } } - CheckVarIsNotPartOfAnotherVar(dir.source, objectList); + CheckVarIsNotPartOfAnotherVar(source, objects); } -void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { +void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) { const auto &dir{std::get(x.t)}; - const auto &objectList{std::get(x.t)}; - CheckPredefinedAllocatorRestriction(dir.source, objectList); + PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); +} + +void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { + if (!inExecutableAllocate_) { + const auto &dir{std::get(x.t)}; + const auto &clauseList{std::get(x.t)}; + const auto &objectList{std::get(x.t)}; + + isPredefinedAllocator = true; + CheckAllocateDirective(dir.source, objectList, clauseList); + } dirContext_.pop_back(); } @@ -2069,6 +2154,7 @@ void OmpStructureChecker::CheckNameInAllocateStmt( } void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { + inExecutableAllocate_ = true; 
const auto &dir{std::get(x.t)}; PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); @@ -2078,24 +2164,6 @@ void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); } - bool hasAllocator = false; - // TODO: Investigate whether searching the clause list can be done with - // parser::Unwrap instead of the following loop - const auto &clauseList{std::get(x.t)}; - for (const auto &clause : clauseList.v) { - if (std::get_if(&clause.u)) { - hasAllocator = true; - } - } - - if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && !hasAllocator) { - // TODO: expand this check to exclude the case when a requires - // directive with the dynamic_allocators clause is present - // in the same compilation unit (OMP5.0 2.11.3). - context_.Say(x.source, - "ALLOCATE directives that appear in a TARGET region must specify an allocator clause"_err_en_US); - } - const auto &allocateStmt = std::get>(x.t).statement; if (const auto &list{std::get>(x.t)}) { @@ -2112,18 +2180,34 @@ void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { } isPredefinedAllocator = true; - const auto &objectList{std::get>(x.t)}; - if (objectList) { - CheckVarIsNotPartOfAnotherVar(dir.source, *objectList); - } } void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { - const auto &dir{std::get(x.t)}; - const auto &objectList{std::get>(x.t)}; - if (objectList) - CheckPredefinedAllocatorRestriction(dir.source, *objectList); + parser::OmpObjectList empty{std::list{}}; + auto &objects{[&]() -> const parser::OmpObjectList & { + if (auto &objects{std::get>(x.t)}) { + return *objects; + } else { + return empty; + } + }()}; + auto &clauses{std::get(x.t)}; + CheckAllocateDirective( + std::get(x.t).source, objects, clauses); + + if (const auto &subDirs{ + std::get>>( + x.t)}) { + for (const auto &dalloc : *subDirs) { + const auto &dir{std::get(x.t)}; + const auto &clauses{std::get(dalloc.t)}; + const auto &objects{std::get(dalloc.t)}; + CheckAllocateDirective(dir.source, objects, clauses); + } + } + dirContext_.pop_back(); + inExecutableAllocate_ = false; } void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 3b184f1214904..7426559e77ff7 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -188,10 +188,12 @@ class OmpStructureChecker : public OmpStructureCheckerBase { const parser::CharBlock &, const OmpDirectiveSet &); bool IsCloselyNestedRegion(const OmpDirectiveSet &set); bool IsNestedInDirective(llvm::omp::Directive directive); + bool InTargetRegion(); void HasInvalidTeamsNesting( const llvm::omp::Directive &dir, const parser::CharBlock &source); void HasInvalidDistributeNesting(const parser::OpenMPLoopConstruct &x); void HasInvalidLoopBinding(const parser::OpenMPLoopConstruct &x); + bool HasRequires(llvm::omp::Clause req); // specific clause related void CheckAllowedMapTypes( parser::OmpMapType::Value, llvm::ArrayRef); @@ -261,6 +263,9 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool CheckTargetBlockOnlyTeams(const parser::Block &); void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock); void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock); + void CheckAllocateDirective(parser::CharBlock source, + 
const parser::OmpObjectList &objects, + const parser::OmpClauseList &clauses); void CheckIteratorRange(const parser::OmpIteratorSpecifier &x); void CheckIteratorModifier(const parser::OmpIterator &x); @@ -378,6 +383,7 @@ class OmpStructureChecker : public OmpStructureCheckerBase { }; int directiveNest_[LastType + 1] = {0}; + bool inExecutableAllocate_{false}; parser::CharBlock visitedAtomicSource_; SymbolSourceMap deferredNonVariables_; diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp index 292e73b4899c0..cc55bb4954cc3 100644 --- a/flang/lib/Semantics/openmp-utils.cpp +++ b/flang/lib/Semantics/openmp-utils.cpp @@ -218,7 +218,7 @@ bool IsMapExitingType(parser::OmpMapType::Value type) { } } -std::optional GetEvaluateExpr(const parser::Expr &parserExpr) { +MaybeExpr GetEvaluateExpr(const parser::Expr &parserExpr) { const parser::TypedExpr &typedExpr{parserExpr.typedExpr}; // ForwardOwningPointer typedExpr // `- GenericExprWrapper ^.get() diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index c410bd48b352c..196755e2912a8 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -3094,26 +3094,6 @@ void OmpAttributeVisitor::ResolveOmpDesignator( AddAllocateName(name); } } - if (ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective && - IsAllocatable(*symbol) && - !IsNestedInDirective(llvm::omp::Directive::OMPD_allocate)) { - context_.Say(designator.source, - "List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement"_err_en_US); - } - bool checkScope{ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective}; - // In 5.1 the scope check only applies to declarative allocate. 
- if (version == 50 && !checkScope) { - checkScope = ompFlag == Symbol::Flag::OmpExecutableAllocateDirective; - } - if (checkScope) { - if (omp::GetScopingUnit(GetContext().scope) != - omp::GetScopingUnit(symbol->GetUltimate().owner())) { - context_.Say(designator.source, // 2.15.3 - "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US, - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName(directive, version))); - } - } if (ompFlag == Symbol::Flag::OmpReduction) { // Using variables inside of a namelist in OpenMP reductions // is allowed by the standard, but is not allowed for diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 1d99811156438..229fd4d6c3f95 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -15,7 +15,7 @@ subroutine sema() integer :: a, b real, dimension (:,:), allocatable :: darray - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears + !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(y) print *, a diff --git a/flang/test/Semantics/OpenMP/allocate04.f90 b/flang/test/Semantics/OpenMP/allocate04.f90 index bbd74eb2ca101..5fd75bad6c4ec 100644 --- a/flang/test/Semantics/OpenMP/allocate04.f90 +++ b/flang/test/Semantics/OpenMP/allocate04.f90 @@ -14,16 +14,19 @@ subroutine allocate(z) type(c_ptr), pointer :: p integer :: x, y, z - associate (a => x) - !$omp allocate(x) allocator(omp_default_mem_alloc) - !ERROR: PRIVATE clause is not allowed on the ALLOCATE directive !$omp allocate(y) private(y) - !ERROR: List item 'z' in ALLOCATE directive must not be a dummy argument - !$omp allocate(z) - !ERROR: List item 'p' in ALLOCATE directive must not have POINTER attribute + !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(p) - !ERROR: List item 'a' in ALLOCATE directive must not be an associate name + + associate (a => x) + block + !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears + !$omp allocate(x) allocator(omp_default_mem_alloc) + + !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears + !ERROR: A list item in a declarative ALLOCATE cannot be an associate name !$omp allocate(a) + end block end associate end subroutine allocate diff --git a/flang/test/Semantics/OpenMP/allocate05.f90 b/flang/test/Semantics/OpenMP/allocate05.f90 index a787e8bb32a4c..b5f7864a42b92 100644 --- a/flang/test/Semantics/OpenMP/allocate05.f90 +++ b/flang/test/Semantics/OpenMP/allocate05.f90 @@ -18,7 +18,7 @@ subroutine allocate() !$omp end target !$omp target - !ERROR: ALLOCATE directives that appear in a TARGET region must specify an allocator clause + !ERROR: An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified !$omp allocate allocate ( darray(a, b) ) !$omp end target diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90 index e14134cd07301..9b57322bbadc6 100644 --- a/flang/test/Semantics/OpenMP/allocate06.f90 +++ b/flang/test/Semantics/OpenMP/allocate06.f90 @@ -11,7 +11,7 @@ subroutine allocate() integer :: a, b, x real, dimension (:,:), allocatable :: darray - !ERROR: List items specified in the ALLOCATE directive must not 
have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement + !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(darray) allocator(omp_default_mem_alloc) !$omp allocate(darray) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90 index 5bfa918be4cad..f4f11e229a28b 100644 --- a/flang/test/Semantics/OpenMP/allocate08.f90 +++ b/flang/test/Semantics/OpenMP/allocate08.f90 @@ -3,14 +3,15 @@ ! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags ! OpenMP Version 5.0 ! 2.11.3 allocate Directive -! If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a -! module, then only predefined memory allocator parameters can be used in the allocator clause +! If list items within the ALLOCATE directive have the SAVE attribute, are a +! common block name, or are declared in the scope of a module, then only +! predefined memory allocator parameters can be used in the allocator clause module AllocateModule INTEGER :: z end module -subroutine allocate() +subroutine allocate(custom_allocator) use omp_lib use AllocateModule integer, SAVE :: x @@ -18,30 +19,25 @@ subroutine allocate() COMMON /CommonName/ y integer(kind=omp_allocator_handle_kind) :: custom_allocator - integer(kind=omp_memspace_handle_kind) :: memspace - type(omp_alloctrait), dimension(1) :: trait - memspace = omp_default_mem_space - trait(1)%key = fallback - trait(1)%value = default_mem_fb - custom_allocator = omp_init_allocator(memspace, 1, trait) !$omp allocate(x) allocator(omp_default_mem_alloc) + !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block !$omp allocate(y) allocator(omp_default_mem_alloc) - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears + !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(z) allocator(omp_default_mem_alloc) + !ERROR: If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator !$omp allocate(x) + !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block !$omp allocate(y) - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears + !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(z) !$omp allocate(w) allocator(custom_allocator) - !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause !$omp allocate(x) allocator(custom_allocator) - !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause + !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block !$omp allocate(y) allocator(custom_allocator) - !ERROR: If list items within 
the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears + !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(z) allocator(custom_allocator) end subroutine allocate diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90 index c71c7ca8466ba..212e48fbd1b26 100644 --- a/flang/test/Semantics/OpenMP/allocators04.f90 +++ b/flang/test/Semantics/OpenMP/allocators04.f90 @@ -22,12 +22,10 @@ subroutine allocate() trait(1)%value = default_mem_fb custom_allocator = omp_init_allocator(omp_default_mem_space, 1, trait) - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears !$omp allocators allocate(omp_default_mem_alloc: a) allocate(a) !ERROR: If list items within the ALLOCATORS directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears !$omp allocators allocate(custom_allocator: b) allocate(b) end subroutine diff --git a/flang/test/Semantics/OpenMP/allocators06.f90 b/flang/test/Semantics/OpenMP/allocators06.f90 deleted file mode 100644 index 8e63512369e38..0000000000000 --- a/flang/test/Semantics/OpenMP/allocators06.f90 +++ /dev/null @@ -1,18 +0,0 @@ -! REQUIRES: openmp_runtime - -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50 -! OpenMP Version 5.2 -! Inherited from 2.11.3 allocate directive -! The allocate directive must appear in the same scope as the declarations of -! each of its list items and must follow all such declarations. - -subroutine allocate() - use omp_lib - integer, allocatable :: a -contains - subroutine test() - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears - !$omp allocators allocate(omp_default_mem_alloc: a) - allocate(a) - end subroutine -end subroutine diff --git a/flang/test/Semantics/OpenMP/declarative-directive02.f90 b/flang/test/Semantics/OpenMP/declarative-directive02.f90 index dcde963689eb0..04b8c3d43ba7a 100644 --- a/flang/test/Semantics/OpenMP/declarative-directive02.f90 +++ b/flang/test/Semantics/OpenMP/declarative-directive02.f90 @@ -9,7 +9,7 @@ subroutine test_decl implicit none save :: x1, y1 !$omp threadprivate(x1) - !$omp allocate(y1) + !$omp allocate(y1) allocator(0) integer :: x1, y1 ! OMPv5.2 7.7 declare-simd @@ -33,12 +33,12 @@ subroutine test_decl subroutine test_decl2 save x1, y1 !$omp threadprivate(x1) - !$omp allocate(y1) + !$omp allocate(y1) allocator(0) integer :: x1, y1 ! implicit decl !$omp threadprivate(x2) - !$omp allocate(y2) + !$omp allocate(y2) allocator(0) save x2, y2 end subroutine From 23ead476550a667d532554e966704494173fd9d7 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Oct 2025 12:47:48 -0400 Subject: [PATCH 137/530] [flang][mlir] Migrate to free create functions. NFC. (#164657) See https://discourse.llvm.org/t/psa-opty-create-now-with-100-more-tab-complete/87339. I plan to mark these as deprecated in https://github.com/llvm/llvm-project/pull/164649. 
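A minimal C++ sketch of the rewrite pattern (illustrative only, not part of
this patch; `emitLoad` is a hypothetical helper, while the
`fir::LoadOp::create(builder, loc, addr)` form mirrors calls that appear in
the diff below). The free create functions take the builder as their first
argument and otherwise keep the argument list unchanged:

```cpp
#include "flang/Optimizer/Dialect/FIROps.h"
#include "mlir/IR/Builders.h"

static mlir::Value emitLoad(mlir::OpBuilder &builder, mlir::Location loc,
                            mlir::Value addr) {
  // Old prefix form (slated for deprecation):
  //   return builder.create<fir::LoadOp>(loc, addr);
  // New free create function used throughout this patch:
  return fir::LoadOp::create(builder, loc, addr);
}
```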
--- flang/lib/Lower/Bridge.cpp | 2 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 55 ++++---- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 2 +- flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp | 7 +- .../Transforms/ACCRecipeBufferization.cpp | 6 +- .../Optimizer/OpenMP/AutomapToTargetData.cpp | 4 +- .../OpenMP/DoConcurrentConversion.cpp | 22 ++-- .../Optimizer/OpenMP/LowerWorkdistribute.cpp | 119 +++++++++--------- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 37 +++--- flang/lib/Optimizer/Support/Utils.cpp | 6 +- flang/lib/Utils/OpenMP.cpp | 2 +- 11 files changed, 132 insertions(+), 130 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index acb8e114c167b..a516a44204cac 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1766,7 +1766,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { // to a crash due to a block with no terminator. See issue #126452. mlir::FunctionType funcType = builder->getFunction().getFunctionType(); mlir::Type resultType = funcType.getResult(0); - mlir::Value undefResult = builder->create(loc, resultType); + mlir::Value undefResult = fir::UndefOp::create(*builder, loc, resultType); genExitRoutine(false, undefResult); return; } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index a49961cc233c6..71067283d13f7 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2059,37 +2059,38 @@ static void genCanonicalLoopNest( // Start lowering mlir::Value zero = firOpBuilder.createIntegerConstant(loc, loopVarType, 0); mlir::Value one = firOpBuilder.createIntegerConstant(loc, loopVarType, 1); - mlir::Value isDownwards = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::slt, loopStepVar, zero); + mlir::Value isDownwards = mlir::arith::CmpIOp::create( + firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, loopStepVar, zero); // Ensure we are counting upwards. If not, negate step and swap lb and ub. mlir::Value negStep = - firOpBuilder.create(loc, zero, loopStepVar); - mlir::Value incr = firOpBuilder.create( - loc, isDownwards, negStep, loopStepVar); - mlir::Value lb = firOpBuilder.create( - loc, isDownwards, loopUBVar, loopLBVar); - mlir::Value ub = firOpBuilder.create( - loc, isDownwards, loopLBVar, loopUBVar); + mlir::arith::SubIOp::create(firOpBuilder, loc, zero, loopStepVar); + mlir::Value incr = mlir::arith::SelectOp::create( + firOpBuilder, loc, isDownwards, negStep, loopStepVar); + mlir::Value lb = mlir::arith::SelectOp::create( + firOpBuilder, loc, isDownwards, loopUBVar, loopLBVar); + mlir::Value ub = mlir::arith::SelectOp::create( + firOpBuilder, loc, isDownwards, loopLBVar, loopUBVar); // Compute the trip count assuming lb <= ub. This guarantees that the result // is non-negative and we can use unsigned arithmetic. 
- mlir::Value span = firOpBuilder.create( - loc, ub, lb, ::mlir::arith::IntegerOverflowFlags::nuw); + mlir::Value span = mlir::arith::SubIOp::create( + firOpBuilder, loc, ub, lb, ::mlir::arith::IntegerOverflowFlags::nuw); mlir::Value tcMinusOne = - firOpBuilder.create(loc, span, incr); - mlir::Value tcIfLooping = firOpBuilder.create( - loc, tcMinusOne, one, ::mlir::arith::IntegerOverflowFlags::nuw); + mlir::arith::DivUIOp::create(firOpBuilder, loc, span, incr); + mlir::Value tcIfLooping = + mlir::arith::AddIOp::create(firOpBuilder, loc, tcMinusOne, one, + ::mlir::arith::IntegerOverflowFlags::nuw); // Fall back to 0 if lb > ub - mlir::Value isZeroTC = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::slt, ub, lb); - mlir::Value tripcount = firOpBuilder.create( - loc, isZeroTC, zero, tcIfLooping); + mlir::Value isZeroTC = mlir::arith::CmpIOp::create( + firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, ub, lb); + mlir::Value tripcount = mlir::arith::SelectOp::create( + firOpBuilder, loc, isZeroTC, zero, tcIfLooping); tripcounts.push_back(tripcount); // Create the CLI handle. - auto newcli = firOpBuilder.create(loc); + auto newcli = mlir::omp::NewCliOp::create(firOpBuilder, loc); mlir::Value cli = newcli.getResult(); clis.push_back(cli); @@ -2122,10 +2123,10 @@ static void genCanonicalLoopNest( "Expecting all block args to have been collected by now"); for (auto j : llvm::seq(numLoops)) { mlir::Value natIterNum = fir::getBase(blockArgs[j]); - mlir::Value scaled = firOpBuilder.create( - loc, natIterNum, loopStepVars[j]); - mlir::Value userVal = firOpBuilder.create( - loc, loopLBVars[j], scaled); + mlir::Value scaled = mlir::arith::MulIOp::create( + firOpBuilder, loc, natIterNum, loopStepVars[j]); + mlir::Value userVal = mlir::arith::AddIOp::create( + firOpBuilder, loc, loopLBVars[j], scaled); mlir::OpBuilder::InsertPoint insPt = firOpBuilder.saveInsertionPoint(); @@ -2198,9 +2199,9 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter, gridGeneratees.reserve(numLoops); intratileGeneratees.reserve(numLoops); for ([[maybe_unused]] auto i : llvm::seq(0, sizesClause.sizes.size())) { - auto gridCLI = firOpBuilder.create(loc); + auto gridCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc); gridGeneratees.push_back(gridCLI.getResult()); - auto intratileCLI = firOpBuilder.create(loc); + auto intratileCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc); intratileGeneratees.push_back(intratileCLI.getResult()); } @@ -2209,8 +2210,8 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter, generatees.append(gridGeneratees); generatees.append(intratileGeneratees); - firOpBuilder.create(loc, generatees, applyees, - sizesClause.sizes); + mlir::omp::TileOp::create(firOpBuilder, loc, generatees, applyees, + sizesClause.sizes); } static void genUnrollOp(Fortran::lower::AbstractConverter &converter, diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index e71f4e3cee49c..478ab151b96d0 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1151,7 +1151,7 @@ struct AllocMemOpConversion : public fir::FIROpConversion { mlir::Value size = genTypeSizeInBytes(loc, ity, rewriter, llvmObjectTy); if (auto scaleSize = fir::genAllocationScaleSize(loc, heap.getInType(), ity, rewriter)) - size = rewriter.create(loc, ity, size, scaleSize); + size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize); for (mlir::Value opnd : adaptor.getOperands()) size = mlir::LLVM::MulOp::create(rewriter, 
loc, ity, size, integerCast(loc, rewriter, ity, opnd)); diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp index 381b2a29c517a..f74d635d50a75 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp @@ -242,10 +242,11 @@ struct TargetAllocMemOpConversion loc, llvmObjectTy, ity, rewriter, lowerTy().getDataLayout()); if (auto scaleSize = fir::genAllocationScaleSize( loc, allocmemOp.getInType(), ity, rewriter)) - size = rewriter.create(loc, ity, size, scaleSize); + size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize); for (mlir::Value opnd : adaptor.getOperands().drop_front()) - size = rewriter.create( - loc, ity, size, integerCast(lowerTy(), loc, rewriter, ity, opnd)); + size = mlir::LLVM::MulOp::create( + rewriter, loc, ity, size, + integerCast(lowerTy(), loc, rewriter, ity, opnd)); auto mallocTyWidth = lowerTy().getIndexTypeBitwidth(); auto mallocTy = mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth); diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp index 4840a999ecd27..0d135a94588e4 100644 --- a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp +++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp @@ -39,13 +39,13 @@ class BufferizeInterface { static mlir::Operation *load(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value value) { - return builder.create(loc, value); + return fir::LoadOp::create(builder, loc, value); } static mlir::Value placeInMemory(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value value) { - auto alloca = builder.create(loc, value.getType()); - builder.create(loc, value, alloca); + auto alloca = fir::AllocaOp::create(builder, loc, value.getType()); + fir::StoreOp::create(builder, loc, value, alloca); return alloca; } }; diff --git a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp index 817434ff3dc30..5793d46a192a7 100644 --- a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp +++ b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp @@ -130,8 +130,8 @@ class AutomapToTargetDataPass builder.getBoolAttr(false)); clauses.mapVars.push_back(mapInfo); isa(memOp) - ? builder.create(memOp.getLoc(), clauses) - : builder.create(memOp.getLoc(), clauses); + ? omp::TargetEnterDataOp::create(builder, memOp.getLoc(), clauses) + : omp::TargetExitDataOp::create(builder, memOp.getLoc(), clauses); }; for (fir::GlobalOp globalOp : automapGlobals) { diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 65a23be243716..1229018bd9b3e 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -595,7 +595,7 @@ class DoConcurrentConversion mlir::omp::TargetOperands &clauseOps, mlir::omp::LoopNestOperands &loopNestClauseOps, const LiveInShapeInfoMap &liveInShapeInfoMap) const { - auto targetOp = rewriter.create(loc, clauseOps); + auto targetOp = mlir::omp::TargetOp::create(rewriter, loc, clauseOps); auto argIface = llvm::cast(*targetOp); mlir::Region ®ion = targetOp.getRegion(); @@ -672,7 +672,7 @@ class DoConcurrentConversion // temporary. 
Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp); rewriter.setInsertionPoint( - rewriter.create(targetOp.getLoc())); + mlir::omp::TerminatorOp::create(rewriter, targetOp.getLoc())); return targetOp; } @@ -715,8 +715,8 @@ class DoConcurrentConversion auto shapeShiftType = fir::ShapeShiftType::get( builder.getContext(), shapeShiftOperands.size() / 2); - return builder.create( - liveInArg.getLoc(), shapeShiftType, shapeShiftOperands); + return fir::ShapeShiftOp::create(builder, liveInArg.getLoc(), + shapeShiftType, shapeShiftOperands); } llvm::SmallVector shapeOperands; @@ -728,11 +728,11 @@ class DoConcurrentConversion ++shapeIdx; } - return builder.create(liveInArg.getLoc(), shapeOperands); + return fir::ShapeOp::create(builder, liveInArg.getLoc(), shapeOperands); }(); - return builder.create(liveInArg.getLoc(), liveInArg, - liveInName, shape); + return hlfir::DeclareOp::create(builder, liveInArg.getLoc(), liveInArg, + liveInName, shape); } mlir::omp::TeamsOp genTeamsOp(mlir::ConversionPatternRewriter &rewriter, @@ -742,13 +742,13 @@ class DoConcurrentConversion genReductions(rewriter, mapper, loop, teamsOps); mlir::Location loc = loop.getLoc(); - auto teamsOp = rewriter.create(loc, teamsOps); + auto teamsOp = mlir::omp::TeamsOp::create(rewriter, loc, teamsOps); Fortran::common::openmp::EntryBlockArgs teamsArgs; teamsArgs.reduction.vars = teamsOps.reductionVars; Fortran::common::openmp::genEntryBlock(rewriter, teamsArgs, teamsOp.getRegion()); - rewriter.setInsertionPoint(rewriter.create(loc)); + rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc)); for (auto [loopVar, teamsArg] : llvm::zip_equal( loop.getReduceVars(), teamsOp.getRegion().getArguments())) { @@ -761,8 +761,8 @@ class DoConcurrentConversion mlir::omp::DistributeOp genDistributeOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter) const { - auto distOp = rewriter.create( - loc, /*clauses=*/mlir::omp::DistributeOperands{}); + auto distOp = mlir::omp::DistributeOp::create( + rewriter, loc, /*clauses=*/mlir::omp::DistributeOperands{}); rewriter.createBlock(&distOp.getRegion()); return distOp; diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp index 8a9b383ec1356..7b61539984232 100644 --- a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp +++ b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp @@ -282,14 +282,14 @@ fissionWorkdistribute(omp::WorkdistributeOp workdistribute) { &newTeams.getRegion(), newTeams.getRegion().begin(), {}, {}); for (auto arg : teamsBlock->getArguments()) newTeamsBlock->addArgument(arg.getType(), arg.getLoc()); - auto newWorkdistribute = rewriter.create(loc); - rewriter.create(loc); + auto newWorkdistribute = omp::WorkdistributeOp::create(rewriter, loc); + omp::TerminatorOp::create(rewriter, loc); rewriter.createBlock(&newWorkdistribute.getRegion(), newWorkdistribute.getRegion().begin(), {}, {}); auto *cloned = rewriter.clone(*parallelize); parallelize->replaceAllUsesWith(cloned); parallelize->erase(); - rewriter.create(loc); + omp::TerminatorOp::create(rewriter, loc); changed = true; } } @@ -298,10 +298,10 @@ fissionWorkdistribute(omp::WorkdistributeOp workdistribute) { /// Generate omp.parallel operation with an empty region. 
static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) { - auto parallelOp = rewriter.create(loc); + auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc); parallelOp.setComposite(composite); rewriter.createBlock(¶llelOp.getRegion()); - rewriter.setInsertionPoint(rewriter.create(loc)); + rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc)); return; } @@ -309,7 +309,7 @@ static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) { static void genDistributeOp(Location loc, OpBuilder &rewriter, bool composite) { mlir::omp::DistributeOperands distributeClauseOps; auto distributeOp = - rewriter.create(loc, distributeClauseOps); + mlir::omp::DistributeOp::create(rewriter, loc, distributeClauseOps); distributeOp.setComposite(composite); auto distributeBlock = rewriter.createBlock(&distributeOp.getRegion()); rewriter.setInsertionPointToStart(distributeBlock); @@ -334,12 +334,12 @@ static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop, const mlir::omp::LoopNestOperands &clauseOps, bool composite) { - auto wsloopOp = rewriter.create(doLoop.getLoc()); + auto wsloopOp = mlir::omp::WsloopOp::create(rewriter, doLoop.getLoc()); wsloopOp.setComposite(composite); rewriter.createBlock(&wsloopOp.getRegion()); auto loopNestOp = - rewriter.create(doLoop.getLoc(), clauseOps); + mlir::omp::LoopNestOp::create(rewriter, doLoop.getLoc(), clauseOps); // Clone the loop's body inside the loop nest construct using the // mapped values. @@ -351,7 +351,7 @@ static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop, // Erase fir.result op of do loop and create yield op. if (auto resultOp = dyn_cast(terminatorOp)) { rewriter.setInsertionPoint(terminatorOp); - rewriter.create(doLoop->getLoc()); + mlir::omp::YieldOp::create(rewriter, doLoop->getLoc()); terminatorOp->erase(); } } @@ -494,15 +494,15 @@ static SmallVector convertFlatToMultiDim(OpBuilder &builder, // Convert flat index to multi-dimensional indices SmallVector indices(rank); Value temp = flatIdx; - auto c1 = builder.create(loc, 1); + auto c1 = arith::ConstantIndexOp::create(builder, loc, 1); // Work backwards through dimensions (row-major order) for (int i = rank - 1; i >= 0; --i) { - Value zeroBasedIdx = builder.create(loc, temp, extents[i]); + Value zeroBasedIdx = arith::RemSIOp::create(builder, loc, temp, extents[i]); // Convert to one-based index - indices[i] = builder.create(loc, zeroBasedIdx, c1); + indices[i] = arith::AddIOp::create(builder, loc, zeroBasedIdx, c1); if (i > 0) { - temp = builder.create(loc, temp, extents[i]); + temp = arith::DivSIOp::create(builder, loc, temp, extents[i]); } } @@ -525,7 +525,7 @@ static Value CalculateTotalElements(OpBuilder &builder, Location loc, if (i == 0) { totalElems = extent; } else { - totalElems = builder.create(loc, totalElems, extent); + totalElems = arith::MulIOp::create(builder, loc, totalElems, extent); } } return totalElems; @@ -562,14 +562,14 @@ static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc, // Load destination array box (if it's a reference) Value arrayBox = destBox; if (isa(destBox.getType())) - arrayBox = builder.create(loc, destBox); + arrayBox = fir::LoadOp::create(builder, loc, destBox); - auto scalarValue = builder.create(loc, srcBox); - Value scalar = builder.create(loc, scalarValue); + auto scalarValue = fir::BoxAddrOp::create(builder, loc, srcBox); + Value scalar = fir::LoadOp::create(builder, loc, scalarValue); // Calculate total number of elements (flattened) - auto c0 
= builder.create(loc, 0); - auto c1 = builder.create(loc, 1); + auto c0 = arith::ConstantIndexOp::create(builder, loc, 0); + auto c1 = arith::ConstantIndexOp::create(builder, loc, 1); Value totalElems = CalculateTotalElements(builder, loc, arrayBox); auto *workdistributeBlock = &workdistribute.getRegion().front(); @@ -587,7 +587,7 @@ static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc, builder, loc, fir::ReferenceType::get(scalar.getType()), arrayBox, nullptr, nullptr, ValueRange{indices}, ValueRange{}); - builder.create(loc, scalar, elemPtr); + fir::StoreOp::create(builder, loc, scalar, elemPtr); } /// workdistributeRuntimeCallLower method finds the runtime calls @@ -749,14 +749,15 @@ FailureOr splitTargetData(omp::TargetOp targetOp, auto deviceAddrVars = targetOp.getHasDeviceAddrVars(); auto devicePtrVars = targetOp.getIsDevicePtrVars(); // Create the target data op - auto targetDataOp = rewriter.create( - loc, device, ifExpr, outerMapInfos, deviceAddrVars, devicePtrVars); + auto targetDataOp = + omp::TargetDataOp::create(rewriter, loc, device, ifExpr, outerMapInfos, + deviceAddrVars, devicePtrVars); auto taregtDataBlock = rewriter.createBlock(&targetDataOp.getRegion()); - rewriter.create(loc); + mlir::omp::TerminatorOp::create(rewriter, loc); rewriter.setInsertionPointToStart(taregtDataBlock); // Create the inner target op - auto newTargetOp = rewriter.create( - targetOp.getLoc(), targetOp.getAllocateVars(), + auto newTargetOp = omp::TargetOp::create( + rewriter, targetOp.getLoc(), targetOp.getAllocateVars(), targetOp.getAllocatorVars(), targetOp.getBareAttr(), targetOp.getDependKindsAttr(), targetOp.getDependVars(), targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), @@ -821,19 +822,19 @@ static TempOmpVar allocateTempOmpVar(Location loc, Type ty, // Get the appropriate type for allocation if (isPtr(ty)) { Type intTy = rewriter.getI32Type(); - auto one = rewriter.create(loc, intTy, 1); + auto one = LLVM::ConstantOp::create(rewriter, loc, intTy, 1); allocType = llvmPtrTy; - alloc = rewriter.create(loc, llvmPtrTy, allocType, one); + alloc = LLVM::AllocaOp::create(rewriter, loc, llvmPtrTy, allocType, one); allocType = intTy; } else { allocType = ty; - alloc = rewriter.create(loc, allocType); + alloc = fir::AllocaOp::create(rewriter, loc, allocType); } // Lambda to create mapinfo ops auto getMapInfo = [&](mlir::omp::ClauseMapFlags mappingFlags, const char *name) { - return rewriter.create( - loc, alloc.getType(), alloc, TypeAttr::get(allocType), + return omp::MapInfoOp::create( + rewriter, loc, alloc.getType(), alloc, TypeAttr::get(allocType), rewriter.getAttr(mappingFlags), rewriter.getAttr( omp::VariableCaptureKind::ByRef), @@ -979,12 +980,12 @@ static void reloadCacheAndRecompute( // If the original value is a pointer or reference, load and convert if // necessary. 
if (isPtr(original.getType())) { - restored = rewriter.create(loc, llvmPtrTy, newArg); + restored = LLVM::LoadOp::create(rewriter, loc, llvmPtrTy, newArg); if (!isa(original.getType())) restored = - rewriter.create(loc, original.getType(), restored); + fir::ConvertOp::create(rewriter, loc, original.getType(), restored); } else { - restored = rewriter.create(loc, newArg); + restored = fir::LoadOp::create(rewriter, loc, newArg); } irMapping.map(original, restored); } @@ -1053,7 +1054,7 @@ static mlir::LLVM::ConstantOp genI32Constant(mlir::Location loc, mlir::RewriterBase &rewriter, int value) { mlir::Type i32Ty = rewriter.getI32Type(); mlir::IntegerAttr attr = rewriter.getI32IntegerAttr(value); - return rewriter.create(loc, i32Ty, attr); + return mlir::LLVM::ConstantOp::create(rewriter, loc, i32Ty, attr); } /// Given a box descriptor, extract the base address of the data it describes. @@ -1230,8 +1231,8 @@ static void genFortranAssignOmpReplacement(fir::FirOpBuilder &builder, genOmpGetMappedPtrIfPresent(builder, loc, destBase, device, module); Value srcPtr = genOmpGetMappedPtrIfPresent(builder, loc, srcBase, device, module); - Value zero = builder.create(loc, builder.getI64Type(), - builder.getI64IntegerAttr(0)); + Value zero = LLVM::ConstantOp::create(builder, loc, builder.getI64Type(), + builder.getI64IntegerAttr(0)); // Generate the call to omp_target_memcpy to perform the data copy on the // device. @@ -1348,23 +1349,24 @@ static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter, for (Operation *op : opsToReplace) { if (auto allocOp = dyn_cast(op)) { rewriter.setInsertionPoint(allocOp); - auto ompAllocmemOp = rewriter.create( - allocOp.getLoc(), rewriter.getI64Type(), device, + auto ompAllocmemOp = omp::TargetAllocMemOp::create( + rewriter, allocOp.getLoc(), rewriter.getI64Type(), device, allocOp.getInTypeAttr(), allocOp.getUniqNameAttr(), allocOp.getBindcNameAttr(), allocOp.getTypeparams(), allocOp.getShape()); - auto firConvertOp = rewriter.create( - allocOp.getLoc(), allocOp.getResult().getType(), - ompAllocmemOp.getResult()); + auto firConvertOp = fir::ConvertOp::create(rewriter, allocOp.getLoc(), + allocOp.getResult().getType(), + ompAllocmemOp.getResult()); rewriter.replaceOp(allocOp, firConvertOp.getResult()); } // Replace fir.freemem with omp.target_freemem. else if (auto freeOp = dyn_cast(op)) { rewriter.setInsertionPoint(freeOp); - auto firConvertOp = rewriter.create( - freeOp.getLoc(), rewriter.getI64Type(), freeOp.getHeapref()); - rewriter.create(freeOp.getLoc(), device, - firConvertOp.getResult()); + auto firConvertOp = + fir::ConvertOp::create(rewriter, freeOp.getLoc(), + rewriter.getI64Type(), freeOp.getHeapref()); + omp::TargetFreeMemOp::create(rewriter, freeOp.getLoc(), device, + firConvertOp.getResult()); rewriter.eraseOp(freeOp); } // fir.declare changes its type when hoisting it out of omp.target to @@ -1376,8 +1378,9 @@ static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter, dyn_cast(clonedInType); Type clonedEleTy = clonedRefType.getElementType(); rewriter.setInsertionPoint(op); - Value loadedValue = rewriter.create( - clonedDeclareOp.getLoc(), clonedEleTy, clonedDeclareOp.getMemref()); + Value loadedValue = + fir::LoadOp::create(rewriter, clonedDeclareOp.getLoc(), clonedEleTy, + clonedDeclareOp.getMemref()); clonedDeclareOp.getResult().replaceAllUsesWith(loadedValue); } // Replace runtime calls with omp versions. 
@@ -1473,8 +1476,8 @@ genPreTargetOp(omp::TargetOp targetOp, SmallVector &preMapOperands, auto *targetBlock = &targetOp.getRegion().front(); SmallVector preHostEvalVars{targetOp.getHostEvalVars()}; // update the hostEvalVars of preTargetOp - omp::TargetOp preTargetOp = rewriter.create( - targetOp.getLoc(), targetOp.getAllocateVars(), + omp::TargetOp preTargetOp = omp::TargetOp::create( + rewriter, targetOp.getLoc(), targetOp.getAllocateVars(), targetOp.getAllocatorVars(), targetOp.getBareAttr(), targetOp.getDependKindsAttr(), targetOp.getDependVars(), targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), preHostEvalVars, @@ -1513,13 +1516,13 @@ genPreTargetOp(omp::TargetOp targetOp, SmallVector &preMapOperands, // Create the store operation. if (isPtr(originalResult.getType())) { if (!isa(toStore.getType())) - toStore = rewriter.create(loc, llvmPtrTy, toStore); - rewriter.create(loc, toStore, newArg); + toStore = fir::ConvertOp::create(rewriter, loc, llvmPtrTy, toStore); + LLVM::StoreOp::create(rewriter, loc, toStore, newArg); } else { - rewriter.create(loc, toStore, newArg); + fir::StoreOp::create(rewriter, loc, toStore, newArg); } } - rewriter.create(loc); + omp::TerminatorOp::create(rewriter, loc); // Update hostEvalVars with the mapped values for the loop bounds if we have // a loopNestOp and we are not generating code for the target device. @@ -1563,8 +1566,8 @@ genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector &postMapOperands, hostEvalVars.steps.end()); } // Create the isolated target op - omp::TargetOp isolatedTargetOp = rewriter.create( - targetOp.getLoc(), targetOp.getAllocateVars(), + omp::TargetOp isolatedTargetOp = omp::TargetOp::create( + rewriter, targetOp.getLoc(), targetOp.getAllocateVars(), targetOp.getAllocatorVars(), targetOp.getBareAttr(), targetOp.getDependKindsAttr(), targetOp.getDependVars(), targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), @@ -1590,7 +1593,7 @@ genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector &postMapOperands, // Clone the original operations. rewriter.clone(*splitBeforeOp, isolatedMapping); - rewriter.create(loc); + omp::TerminatorOp::create(rewriter, loc); // update the loop bounds in the isolatedTargetOp if we have host_eval vars // and we are not generating code for the target device. 
@@ -1643,8 +1646,8 @@ static omp::TargetOp genPostTargetOp(omp::TargetOp targetOp, auto *targetBlock = &targetOp.getRegion().front(); SmallVector postHostEvalVars{targetOp.getHostEvalVars()}; // Create the post target op - omp::TargetOp postTargetOp = rewriter.create( - targetOp.getLoc(), targetOp.getAllocateVars(), + omp::TargetOp postTargetOp = omp::TargetOp::create( + rewriter, targetOp.getLoc(), targetOp.getAllocateVars(), targetOp.getAllocatorVars(), targetOp.getBareAttr(), targetOp.getDependKindsAttr(), targetOp.getDependVars(), targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), postHostEvalVars, diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 566e88b9d6588..bd07d7fe01b85 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -883,18 +883,16 @@ class MapInfoFinalizationPass if (explicitMappingPresent(op, targetDataOp)) return; - mlir::omp::MapInfoOp newDescParentMapOp = - builder.create( - op->getLoc(), op.getResult().getType(), op.getVarPtr(), - op.getVarTypeAttr(), - builder.getAttr( - mlir::omp::ClauseMapFlags::to | - mlir::omp::ClauseMapFlags::always), - op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{}, - mlir::SmallVector{}, mlir::ArrayAttr{}, - /*bounds=*/mlir::SmallVector{}, - /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(), - /*partial_map=*/builder.getBoolAttr(false)); + mlir::omp::MapInfoOp newDescParentMapOp = mlir::omp::MapInfoOp::create( + builder, op->getLoc(), op.getResult().getType(), op.getVarPtr(), + op.getVarTypeAttr(), + builder.getAttr( + mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::always), + op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{}, + mlir::SmallVector{}, mlir::ArrayAttr{}, + /*bounds=*/mlir::SmallVector{}, + /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(), + /*partial_map=*/builder.getBoolAttr(false)); targetDataOp.getMapVarsMutable().append({newDescParentMapOp}); } @@ -946,14 +944,13 @@ class MapInfoFinalizationPass // need to see how well this alteration works. 
auto loadBaseAddr = builder.loadIfRef(op->getLoc(), baseAddr.getVarPtrPtr()); - mlir::omp::MapInfoOp newBaseAddrMapOp = - builder.create( - op->getLoc(), loadBaseAddr.getType(), loadBaseAddr, - baseAddr.getVarTypeAttr(), baseAddr.getMapTypeAttr(), - baseAddr.getMapCaptureTypeAttr(), mlir::Value{}, members, - membersAttr, baseAddr.getBounds(), - /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(), - /*partial_map=*/builder.getBoolAttr(false)); + mlir::omp::MapInfoOp newBaseAddrMapOp = mlir::omp::MapInfoOp::create( + builder, op->getLoc(), loadBaseAddr.getType(), loadBaseAddr, + baseAddr.getVarTypeAttr(), baseAddr.getMapTypeAttr(), + baseAddr.getMapCaptureTypeAttr(), mlir::Value{}, members, membersAttr, + baseAddr.getBounds(), + /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(), + /*partial_map=*/builder.getBoolAttr(false)); op.replaceAllUsesWith(newBaseAddrMapOp.getResult()); op->erase(); baseAddr.erase(); diff --git a/flang/lib/Optimizer/Support/Utils.cpp b/flang/lib/Optimizer/Support/Utils.cpp index 92390e4a3a230..2f33d89564796 100644 --- a/flang/lib/Optimizer/Support/Utils.cpp +++ b/flang/lib/Optimizer/Support/Utils.cpp @@ -66,7 +66,7 @@ fir::genConstantIndex(mlir::Location loc, mlir::Type ity, mlir::ConversionPatternRewriter &rewriter, std::int64_t offset) { auto cattr = rewriter.getI64IntegerAttr(offset); - return rewriter.create(loc, ity, cattr); + return mlir::LLVM::ConstantOp::create(rewriter, loc, ity, cattr); } mlir::Value @@ -125,9 +125,9 @@ mlir::Value fir::integerCast(const fir::LLVMTypeConverter &converter, return rewriter.createOrFold(loc, ty, val); } else { if (toSize < fromSize) - return rewriter.create(loc, ty, val); + return mlir::LLVM::TruncOp::create(rewriter, loc, ty, val); if (toSize > fromSize) - return rewriter.create(loc, ty, val); + return mlir::LLVM::SExtOp::create(rewriter, loc, ty, val); } return val; } diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp index 15a42c3f50866..c2036c4a383fd 100644 --- a/flang/lib/Utils/OpenMP.cpp +++ b/flang/lib/Utils/OpenMP.cpp @@ -112,7 +112,7 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, mlir::Block *entryBlock = &region.getBlocks().front(); firOpBuilder.setInsertionPointToStart(entryBlock); auto loadOp = - firOpBuilder.create(clonedValArg.getLoc(), clonedValArg); + fir::LoadOp::create(firOpBuilder, clonedValArg.getLoc(), clonedValArg); return loadOp.getResult(); } From 47ea8543e26a823a0543bbdf2ff529ec432c09e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 22 Oct 2025 06:48:10 -1000 Subject: [PATCH 138/530] [flang] Update target rewrite to support workgroup and private attributions (#164515) Some operations, like gpu.func, have arguments that need to stay in place while rewriting the signature. This is the case for workgroup and private attributions. Update the target rewrite pass to be aware of this when adding arguments at the end of the function signature. If any trailing arguments are present, the new argument will be inserted just before them.
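As a rough illustration (a simplified sketch derived from the tests added in this patch rather than literal pass output; the function name, value names, and the simplified memref type are placeholders), splitting a !fir.boxchar<1> argument of a gpu.func now inserts the hidden length argument ahead of the trailing workgroup attribution block argument, so it still prints as an ordinary parameter:

  // Before the rewrite (sketch):
  gpu.func @foo(%arg0: f32, %arg1: !fir.boxchar<1>)
      workgroup(%w : memref<1xf32, #gpu.address_space<workgroup>>) {
    gpu.return
  }

  // After the rewrite (sketch): the boxchar is split into a buffer and a
  // length; the length is inserted before the workgroup attribution rather
  // than appended after it.
  gpu.func @foo(%arg0: f32, %buf: !fir.ref<!fir.char<1,?>>, %len: i64)
      workgroup(%w : memref<1xf32, #gpu.address_space<workgroup>>) {
    %0 = fir.emboxchar %buf, %len : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
    gpu.return
  }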
--- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp | 24 +++++++-- flang/test/Fir/CUDA/cuda-target-rewrite.mlir | 53 +++++++++++++++++++ flang/tools/fir-opt/fir-opt.cpp | 1 + 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index ac285b5d403df..0776346870c72 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -872,6 +872,14 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { } } + // Count the number of arguments that have to stay in place at the end of + // the argument list. + unsigned trailingArgs = 0; + if constexpr (std::is_same_v) { + trailingArgs = + func.getNumWorkgroupAttributions() + func.getNumPrivateAttributions(); + } + // Convert return value(s) for (auto ty : funcTy.getResults()) llvm::TypeSwitch(ty) @@ -981,6 +989,16 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { } } + // Add the argument at the end if the number of trailing arguments is 0, + // otherwise insert the argument at the appropriate index. + auto addOrInsertArgument = [&](mlir::Type ty, mlir::Location loc) { + unsigned inputIndex = func.front().getArguments().size() - trailingArgs; + auto newArg = trailingArgs == 0 + ? func.front().addArgument(ty, loc) + : func.front().insertArgument(inputIndex, ty, loc); + return newArg; + }; + if (!func.empty()) { // If the function has a body, then apply the fixups to the arguments and // return ops as required. These fixups are done in place. @@ -1117,8 +1135,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { // original arguments. (Boxchar arguments.) auto newBufArg = func.front().insertArgument(fixup.index, fixupType, loc); - auto newLenArg = - func.front().addArgument(trailingTys[fixup.second], loc); + auto newLenArg = addOrInsertArgument(trailingTys[fixup.second], loc); auto boxTy = oldArgTys[fixup.index - offset]; rewriter->setInsertionPointToStart(&func.front()); auto box = fir::EmboxCharOp::create(*rewriter, loc, boxTy, newBufArg, @@ -1133,8 +1150,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { // appended after all the original arguments. 
auto newProcPointerArg = func.front().insertArgument(fixup.index, fixupType, loc); - auto newLenArg = - func.front().addArgument(trailingTys[fixup.second], loc); + auto newLenArg = addOrInsertArgument(trailingTys[fixup.second], loc); auto tupleType = oldArgTys[fixup.index - offset]; rewriter->setInsertionPointToStart(&func.front()); fir::FirOpBuilder builder(*rewriter, getModule()); diff --git a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir index a334934f31723..48fee10f3db97 100644 --- a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir +++ b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir @@ -55,3 +55,56 @@ func.func @main(%arg0: complex) { // CHECK-SAME: (%arg0: f64, %arg1: f64) kernel { // CHECK: gpu.return // CHECK: gpu.launch_func @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : f64, %{{.*}} : f64) {cuf.proc_attr = #cuf.cuda_proc} + +// ----- + +module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + gpu.module @testmod { + gpu.func @_QMbarPfoo(%arg0: f32, %arg1: !fir.ref>, %arg2: !fir.boxchar<1>) workgroup(%arg3 : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) { + %c0 = arith.constant 0 : index + memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space> + gpu.return + } +// CHECK-LABEL: gpu.func @_QMbarPfoo( +// CHECK-SAME: %{{.*}}: f32, %{{.*}}: !fir.ref>, %[[CHAR:.*]]: !fir.ref>, %[[LENGTH:.*]]: i64) workgroup(%[[WORKGROUP:.*]] : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) { +// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref>, i64) -> !fir.boxchar<1> +// CHECK: memref.store %{{.*}}, %[[WORKGROUP]][%{{.*}}] : memref<1xf32, #gpu.address_space> + + gpu.func @_QMbarPfoo2(%arg0: f32, %arg1: !fir.ref>, %arg2: !fir.boxchar<1>) workgroup(%arg3 : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}, %arg4 : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) { + %c0 = arith.constant 0 : index + memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space> + memref.store %arg0, %arg4[%c0] : memref<1xf32, #gpu.address_space> + gpu.return + } +// CHECK-LABEL: gpu.func @_QMbarPfoo2( +// CHECK-SAME: %{{.*}}: f32, %{{.*}}: !fir.ref>, %[[CHAR:.*]]: !fir.ref>, %[[LENGTH:.*]]: i64) workgroup(%[[WG1:.*]] : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}, %[[WG2:.*]] : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) { +// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref>, i64) -> !fir.boxchar<1> +// CHECK: memref.store %{{.*}}, %[[WG1]][%{{.*}}] : memref<1xf32, #gpu.address_space> +// CHECK: memref.store %{{.*}}, %[[WG2]][%{{.*}}] : memref<1xf32, #gpu.address_space> + + gpu.func @_QMbarPprivate(%arg0: f32, %arg1: !fir.boxchar<1>) workgroup(%arg2 : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) private(%arg3 : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) { + %c0 = arith.constant 0 : index + memref.store %arg0, %arg2[%c0] : memref<1xf32, #gpu.address_space> + memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space> + gpu.return + } +// CHECK-LABEL: gpu.func @_QMbarPprivate( +// CHECK-SAME: %{{.*}}: f32, %[[CHAR:.*]]: !fir.ref>, %[[LENGTH:.*]]: i64) workgroup(%[[WG:.*]] : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) 
private(%[[PRIVATE:.*]] : memref<1xf32, #gpu.address_space> {llvm.align = 16 : i32}) { +// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref>, i64) -> !fir.boxchar<1> +// CHECK: memref.store %{{.*}}, %[[WG]][%{{.*}}] : memref<1xf32, #gpu.address_space> +// CHECK: memref.store %{{.*}}, %[[PRIVATE]][%{{.*}}] : memref<1xf32, #gpu.address_space> + + gpu.func @test_with_char_proc(%arg0: f32, %arg1: tuple<() -> (), i64> {fir.char_proc}) workgroup(%arg2 : memref<1xf32, #gpu.address_space>) { + %c0 = arith.constant 0 : index + memref.store %arg0, %arg2[%c0] : memref<1xf32, #gpu.address_space> + gpu.return + } +// CHECK-LABEL: gpu.func @test_with_char_proc( +// CHECK-SAME: %{{.*}}: f32, %[[CHARPROC:.*]]: () -> () {fir.char_proc}, %[[LENGTH:.*]]: i64) workgroup(%[[WG:.*]] : memref<1xf32, #gpu.address_space>) { +// CHECK: %{{.*}} = fir.undefined tuple<() -> (), i64> +// CHECK: %{{.*}} = fir.insert_value %{{.*}}, %[[CHARPROC]], [0 : index] : (tuple<() -> (), i64>, () -> ()) -> tuple<() -> (), i64> +// CHECK: %{{.*}} = fir.insert_value %{{.*}}, %[[LENGTH]], [1 : index] : (tuple<() -> (), i64>, i64) -> tuple<() -> (), i64> +// CHECK: memref.store %{{.*}}, %[[WG]][%{{.*}}] : memref<1xf32, #gpu.address_space> + } +} + diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp index 32b0a1dfa5c7a..67d07eee1f4fc 100644 --- a/flang/tools/fir-opt/fir-opt.cpp +++ b/flang/tools/fir-opt/fir-opt.cpp @@ -50,6 +50,7 @@ int main(int argc, char **argv) { #endif DialectRegistry registry; fir::support::registerDialects(registry); + registry.insert(); fir::support::addFIRExtensions(registry); return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n", registry)); From 866879f80342b857a8b911c804189c43ac4fc334 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Wed, 22 Oct 2025 10:15:12 -0700 Subject: [PATCH 139/530] [clang] Don't silently inherit the VFS from `FileManager` (#164323) Since https://github.com/llvm/llvm-project/pull/158381 the `CompilerInstance` is aware of the VFS and co-owns it. To reduce scope of that PR, the VFS was being inherited from the `FileManager` during `setFileManager()` if it wasn't configured before. However, the implementation of that setter was buggy. This PR fixes the bug, and moves us closer to the long-term goal of `CompilerInstance` requiring the VFS to be configured explicitly and owned by the instance. --- clang-tools-extra/clang-include-fixer/IncludeFixer.cpp | 1 + clang/include/clang/Frontend/ASTUnit.h | 5 +++++ clang/include/clang/Frontend/CompilerInstance.h | 2 +- clang/lib/Frontend/ASTUnit.cpp | 2 ++ clang/lib/Frontend/CompilerInstance.cpp | 2 -- clang/lib/Frontend/FrontendAction.cpp | 1 + clang/lib/Frontend/PrecompiledPreamble.cpp | 10 +++------- clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp | 1 + clang/lib/Tooling/Tooling.cpp | 1 + clang/tools/clang-installapi/ClangInstallAPI.cpp | 1 + .../DependencyScanning/DependencyScannerTest.cpp | 3 +++ clang/unittests/Tooling/Syntax/TokensTest.cpp | 1 + clang/unittests/Tooling/Syntax/TreeTestBase.cpp | 1 + .../ExpressionParser/Clang/ClangExpressionParser.cpp | 1 - 14 files changed, 21 insertions(+), 11 deletions(-) diff --git a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp index e825547ba0134..799e02ffaf3af 100644 --- a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp +++ b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp @@ -90,6 +90,7 @@ bool IncludeFixerActionFactory::runInvocation( // Set up Clang. 
CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps)); + Compiler.setVirtualFileSystem(Files->getVirtualFileSystemPtr()); Compiler.setFileManager(Files); // Create the compiler's actual diagnostics engine. We want to drop all diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h index f66df89aad904..3cea159afa33c 100644 --- a/clang/include/clang/Frontend/ASTUnit.h +++ b/clang/include/clang/Frontend/ASTUnit.h @@ -499,6 +499,11 @@ class ASTUnit { return *PPOpts; } + IntrusiveRefCntPtr getVirtualFileSystemPtr() { + // FIXME: Don't defer VFS ownership to the FileManager. + return FileMgr->getVirtualFileSystemPtr(); + } + const FileManager &getFileManager() const { return *FileMgr; } FileManager &getFileManager() { return *FileMgr; } IntrusiveRefCntPtr getFileManagerPtr() { return FileMgr; } diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index 44fff69c217c5..2403cbbb652dd 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -460,7 +460,7 @@ class CompilerInstance : public ModuleLoader { FileMgr.resetWithoutRelease(); } - /// Replace the current file manager and virtual file system. + /// Replace the current file manager. void setFileManager(IntrusiveRefCntPtr Value); /// @} diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index cb445682ac48b..d53b64a414ddc 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -1651,6 +1651,7 @@ ASTUnit *ASTUnit::LoadFromCompilerInvocationAction( AST->Reader = nullptr; // Create a file manager object to provide access to and cache the filesystem. + Clang->setVirtualFileSystem(AST->getVirtualFileSystemPtr()); Clang->setFileManager(AST->getFileManagerPtr()); // Create the source manager. @@ -2290,6 +2291,7 @@ void ASTUnit::CodeComplete( "IR inputs not support here!"); // Use the source and file managers that we were given. + Clang->setVirtualFileSystem(FileMgr->getVirtualFileSystemPtr()); Clang->setFileManager(FileMgr); Clang->setSourceManager(SourceMgr); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 584436665622d..374138fe4cf8f 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -160,8 +160,6 @@ bool CompilerInstance::createTarget() { } void CompilerInstance::setFileManager(IntrusiveRefCntPtr Value) { - if (!hasVirtualFileSystem()) - setVirtualFileSystem(Value->getVirtualFileSystemPtr()); assert(Value == nullptr || getVirtualFileSystemPtr() == Value->getVirtualFileSystemPtr()); FileMgr = std::move(Value); diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 0daa20a87dd7d..ed1169eb06d22 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -945,6 +945,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, // Set the shared objects, these are reset when we finish processing the // file, otherwise the CompilerInstance will happily destroy them. 
+ CI.setVirtualFileSystem(AST->getVirtualFileSystemPtr()); CI.setFileManager(AST->getFileManagerPtr()); CI.setSourceManager(AST->getSourceManagerPtr()); CI.setPreprocessor(AST->getPreprocessorPtr()); diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp index 03f70b74dfb42..9bf18b4d897ce 100644 --- a/clang/lib/Frontend/PrecompiledPreamble.cpp +++ b/clang/lib/Frontend/PrecompiledPreamble.cpp @@ -479,16 +479,12 @@ llvm::ErrorOr PrecompiledPreamble::Build( Diagnostics->Reset(); ProcessWarningOptions(*Diagnostics, Clang->getDiagnosticOpts(), *VFS); - VFS = createVFSFromCompilerInvocation(Clang->getInvocation(), *Diagnostics, - VFS); - // Create a file manager object to provide access to and cache the filesystem. - Clang->setFileManager( - llvm::makeIntrusiveRefCnt(Clang->getFileSystemOpts(), VFS)); + Clang->createVirtualFileSystem(VFS); + Clang->createFileManager(); // Create the source manager. - Clang->setSourceManager(llvm::makeIntrusiveRefCnt( - *Diagnostics, Clang->getFileManager())); + Clang->createSourceManager(); auto PreambleDepCollector = std::make_shared(); Clang->addDependencyCollector(PreambleDepCollector); diff --git a/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp b/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp index 5301f88057203..531c642bf4f31 100644 --- a/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp @@ -93,6 +93,7 @@ void ModelInjector::onBodySynthesis(const NamedDecl *D) { // The instance wants to take ownership, however DisableFree frontend option // is set to true to avoid double free issues + Instance.setVirtualFileSystem(CI.getVirtualFileSystemPtr()); Instance.setFileManager(CI.getFileManagerPtr()); Instance.setSourceManager(SM); Instance.setPreprocessor(CI.getPreprocessorPtr()); diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index ea5a37216e959..e8eef5ed9c9fa 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -446,6 +446,7 @@ bool FrontendActionFactory::runInvocation( DiagnosticConsumer *DiagConsumer) { // Create a compiler instance to handle the actual work. CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps)); + Compiler.setVirtualFileSystem(Files->getVirtualFileSystemPtr()); Compiler.setFileManager(Files); // The FrontendAction can have lifetime requirements for Compiler or its diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp index 16abeb10284c0..4e66485343b89 100644 --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp @@ -114,6 +114,7 @@ static bool run(ArrayRef Args, const char *ProgName) { // Set up compilation. std::unique_ptr CI(new CompilerInstance()); + CI->setVirtualFileSystem(FM->getVirtualFileSystemPtr()); CI->setFileManager(FM); CI->createDiagnostics(); if (!CI->hasDiagnostics()) diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp index aa32bb3d39f6d..4523af33e3c28 100644 --- a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp +++ b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp @@ -49,6 +49,8 @@ class TestFileCollector : public DependencyFileGenerator { std::vector &Deps; }; +// FIXME: Use the regular Service/Worker/Collector APIs instead of +// reimplementing the action. 
class TestDependencyScanningAction : public tooling::ToolAction { public: TestDependencyScanningAction(std::vector &Deps) : Deps(Deps) {} @@ -59,6 +61,7 @@ class TestDependencyScanningAction : public tooling::ToolAction { DiagnosticConsumer *DiagConsumer) override { CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps)); + Compiler.setVirtualFileSystem(FileMgr->getVirtualFileSystemPtr()); Compiler.setFileManager(FileMgr); Compiler.createDiagnostics(DiagConsumer, /*ShouldOwnClient=*/false); diff --git a/clang/unittests/Tooling/Syntax/TokensTest.cpp b/clang/unittests/Tooling/Syntax/TokensTest.cpp index 6094177e4817b..47184cbf5d768 100644 --- a/clang/unittests/Tooling/Syntax/TokensTest.cpp +++ b/clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -134,6 +134,7 @@ class TokenCollectorTest : public ::testing::Test { FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release()); CompilerInstance Compiler(std::move(CI)); Compiler.setDiagnostics(Diags); + Compiler.setVirtualFileSystem(FS); Compiler.setFileManager(FileMgr); Compiler.setSourceManager(SourceMgr); diff --git a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp index 400a0d5a1801b..b2be64fc08f3d 100644 --- a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp @@ -153,6 +153,7 @@ SyntaxTreeTest::buildTree(StringRef Code, const TestClangConfig &ClangConfig) { FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release()); CompilerInstance Compiler(Invocation); Compiler.setDiagnostics(Diags); + Compiler.setVirtualFileSystem(FS); Compiler.setFileManager(FileMgr); Compiler.setSourceManager(SourceMgr); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 6b121c934dfbb..990074566be7e 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -754,7 +754,6 @@ ClangExpressionParser::ClangExpressionParser( // Make sure clang uses the same VFS as LLDB. m_compiler->setVirtualFileSystem( FileSystem::Instance().GetVirtualFileSystem()); - m_compiler->createFileManager(); // 2. Configure the compiler with a set of default options that are // appropriate for most situations. From e2ad55499197db540d09e7201b9b80366a0908c3 Mon Sep 17 00:00:00 2001 From: Pranav Bhandarkar Date: Wed, 22 Oct 2025 12:18:56 -0500 Subject: [PATCH 140/530] [Flang][mlir] - Translation of delayed privatization for deferred target-tasks (#155348) This PR adds support for translation of the private clause on deferred target tasks - that is `omp.target` operations with the `nowait` clause. An offloading call for a deferred target-task is not blocking - the offloading (target-generating) host task continues its execution after issuing the offloading call. Therefore, the key problem we need to solve is to ensure that the data needed for private variables to be initialized in the target task persists even after the host task has completed. We do this in a new pass called `PrepareForOMPOffloadPrivatizationPass`. For a privatized variable that needs its host counterpart for initialization (such as the shape of the data from the descriptor when an allocatable is privatized or the value of the data when an allocatable is firstprivatized), - the pass allocates memory on the heap. 
- it then initializes this memory by using the `init` and `copy` (for firstprivate) regions of the corresponding `omp::PrivateClauseOp`. - Finally the memory allocated on the heap is freed using the `dealloc` region of the same `omp::PrivateClauseOp` instance. This step is not straightforward though, because we cannot simply free the memory that's going to be used by another thread without any synchronization. So, for deallocation, we create a `omp.task` after the `omp.target` and synchronize the two with a dummy dependency (using the `depend` clause). In this newly created `omp.task` we do the deallocation. --- .../flang/Optimizer/Passes/Pipelines.h | 1 + flang/lib/Optimizer/Passes/Pipelines.cpp | 6 + flang/test/Fir/basic-program.fir | 1 + .../mlir/Dialect/OpenMP/CMakeLists.txt | 2 + .../Dialect/OpenMP/Transforms/CMakeLists.txt | 5 + .../mlir/Dialect/OpenMP/Transforms/Passes.h | 26 ++ .../mlir/Dialect/OpenMP/Transforms/Passes.td | 26 ++ .../Dialect/LLVMIR/Transforms/CMakeLists.txt | 1 + mlir/lib/Dialect/OpenMP/CMakeLists.txt | 2 + .../Dialect/OpenMP/Transforms/CMakeLists.txt | 14 + .../OpenMPOffloadPrivatizationPrepare.cpp | 442 ++++++++++++++++++ mlir/lib/RegisterAllPasses.cpp | 2 + .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 11 +- ...ffload-privatization-prepare-by-value.mlir | 157 +++++++ .../omp-offload-privatization-prepare.mlir | 201 ++++++++ mlir/test/Target/LLVMIR/openmp-todo.mlir | 18 - 16 files changed, 888 insertions(+), 27 deletions(-) create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td create mode 100644 mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt create mode 100644 mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp create mode 100644 mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir create mode 100644 mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index 682dd829239ef..70b9341347244 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -22,6 +22,7 @@ #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 6dae39b26976d..103e736accca0 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -426,6 +426,12 @@ void createMLIRToLLVMPassPipeline(mlir::PassManager &pm, // Add codegen pass pipeline. fir::createDefaultFIRCodeGenPassPipeline(pm, config, inputFilename); + + // Run a pass to prepare for translation of delayed privatization in the + // context of deferred target tasks. 
+ addPassConditionally(pm, disableFirToLlvmIr, [&]() { + return mlir::omp::createPrepareForOMPOffloadPrivatizationPass(); + }); } } // namespace fir diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 5159c91dafa5c..6d2beae4da1c8 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -161,4 +161,5 @@ func.func @_QQmain() { // PASSES-NEXT: LowerNontemporalPass // PASSES-NEXT: FIRToLLVMLowering // PASSES-NEXT: ReconcileUnrealizedCasts +// PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass // PASSES-NEXT: LLVMIRLoweringPass diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt index b6c8dbac13c0b..691163d9ce9ac 100644 --- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(Transforms) + set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend/OpenMP/OMP.td) mlir_tablegen(OmpCommon.td --gen-directive-decl --directives-dialect=OpenMP) add_mlir_dialect_tablegen_target(omp_common_td) diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt new file mode 100644 index 0000000000000..22f0d92ea4cbf --- /dev/null +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt @@ -0,0 +1,5 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name OpenMP) +add_public_tablegen_target(MLIROpenMPPassIncGen) + +add_mlir_doc(Passes OpenMPPasses ./ -gen-pass-doc) diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h new file mode 100644 index 0000000000000..21b6d1f466558 --- /dev/null +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h @@ -0,0 +1,26 @@ +//===- Passes.h - OpenMP Pass Construction and Registration -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H +#define MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H + +#include "mlir/Pass/Pass.h" + +namespace mlir { + +namespace omp { + +/// Generate the code for registering conversion passes. +#define GEN_PASS_DECL +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc" + +} // namespace omp +} // namespace mlir + +#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES_H diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td new file mode 100644 index 0000000000000..1fde7e08ab433 --- /dev/null +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td @@ -0,0 +1,26 @@ +//===-- Passes.td - OpenMP pass definition file ------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES +#define MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES + +include "mlir/Pass/PassBase.td" + +def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prepare", "ModuleOp"> { + let summary = "Prepare OpenMP maps for privatization for deferred target tasks"; + let description = [{ + When generating LLVMIR for privatized variables in an OpenMP offloading directive (eg. omp::TargetOp) + that creates a deferred target task (when the nowait clause is used), we need to copy the privatized + variable out of the stack of the generating task and into the heap so that the deferred target task + can still access it. However, if such a privatized variable is also mapped, typically the case for + allocatables, then the corresponding `omp::MapInfoOp` needs to be fixed up to map the new heap-allocated + variable and not the original variable. + }]; + let dependentDialects = ["LLVM::LLVMDialect"]; +} +#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt index d4ff0955c5d0e..37a45d478a1fb 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt @@ -18,4 +18,5 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms MLIRPass MLIRTransforms MLIRNVVMDialect + MLIROpenMPDialect ) diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt index 57a6d3445c151..f3c02da458508 100644 --- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(Transforms) + add_mlir_dialect_library(MLIROpenMPDialect IR/OpenMPDialect.cpp diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt new file mode 100644 index 0000000000000..b9b8eda9ed51b --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt @@ -0,0 +1,14 @@ +add_mlir_dialect_library(MLIROpenMPTransforms + OpenMPOffloadPrivatizationPrepare.cpp + + DEPENDS + MLIROpenMPPassIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRFuncDialect + MLIRLLVMDialect + MLIROpenMPDialect + MLIRPass + MLIRTransforms + ) diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp new file mode 100644 index 0000000000000..db54eaa2628a8 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp @@ -0,0 +1,442 @@ +//===- OpenMPOffloadPrivatizationPrepare.cpp - Prepare OMP privatization --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +//===----------------------------------------------------------------------===// +// A pass that prepares OpenMP code for translation of delayed privatization +// in the context of deferred target tasks. Deferred target tasks are created +// when the nowait clause is used on the target directive. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "omp-prepare-for-offload-privatization" + +namespace mlir { +namespace omp { + +#define GEN_PASS_DEF_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS +#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc" + +} // namespace omp +} // namespace mlir + +using namespace mlir; +namespace { + +//===----------------------------------------------------------------------===// +// PrepareForOMPOffloadPrivatizationPass +//===----------------------------------------------------------------------===// + +class PrepareForOMPOffloadPrivatizationPass + : public omp::impl::PrepareForOMPOffloadPrivatizationPassBase< + PrepareForOMPOffloadPrivatizationPass> { + + void runOnOperation() override { + ModuleOp mod = getOperation(); + + // In this pass, we make host-allocated privatized variables persist for + // deferred target tasks by copying them to the heap. Once the target task + // is done, this heap memory is freed. Since all of this happens on the host + // we can skip device modules. + auto offloadModuleInterface = + dyn_cast(mod.getOperation()); + if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice()) + return; + + getOperation()->walk([&](omp::TargetOp targetOp) { + if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp)) + return; + IRRewriter rewriter(&getContext()); + OperandRange privateVars = targetOp.getPrivateVars(); + SmallVector newPrivVars; + Value fakeDependVar; + omp::TaskOp cleanupTaskOp; + + newPrivVars.reserve(privateVars.size()); + std::optional privateSyms = targetOp.getPrivateSyms(); + for (auto [privVarIdx, privVarSymPair] : + llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) { + Value privVar = std::get<0>(privVarSymPair); + Attribute privSym = std::get<1>(privVarSymPair); + + omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym); + if (!privatizer.needsMap()) { + newPrivVars.push_back(privVar); + continue; + } + bool isFirstPrivate = privatizer.getDataSharingType() == + omp::DataSharingClauseType::FirstPrivate; + + Value mappedValue = targetOp.getMappedValueForPrivateVar(privVarIdx); + auto mapInfoOp = cast(mappedValue.getDefiningOp()); + + if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) { + newPrivVars.push_back(privVar); + continue; + } + + // For deferred target tasks (!$omp target nowait), we need to keep + // a copy of the original, i.e. host variable being privatized so + // that it is available when the target task is eventually executed. 
+ // We do this by first allocating as much heap memory as is needed by + // the original variable. Then, we use the init and copy regions of the + // privatizer, an instance of omp::PrivateClauseOp to set up the heap- + // allocated copy. + // After the target task is done, we need to use the dealloc region + // of the privatizer to clean up everything. We also need to free + // the heap memory we allocated. But due to the deferred nature + // of the target task, we cannot simply deallocate right after the + // omp.target operation else we may end up freeing memory before + // its eventual use by the target task. So, we create a dummy + // dependence between the target task and new omp.task. In the omp.task, + // we do all the cleanup. So, we end up with the following structure + // + // omp.target map_entries(..) ... nowait depend(out:fakeDependVar) { + // ... + // omp.terminator + // } + // omp.task depend(in: fakeDependVar) { + // /*cleanup_code*/ + // omp.terminator + // } + // fakeDependVar is the address of the first heap-allocated copy of the + // host variable being privatized. + + bool needsCleanupTask = !privatizer.getDeallocRegion().empty(); + + // Allocate heap memory that corresponds to the type of memory + // pointed to by varPtr + // For boxchars this won't be a pointer. But, MapsForPrivatizedSymbols + // should have mapped the pointer to the boxchar so use that as varPtr. + Value varPtr = mapInfoOp.getVarPtr(); + Type varType = mapInfoOp.getVarType(); + bool isPrivatizedByValue = + !isa(privVar.getType()); + + assert(isa(varPtr.getType())); + Value heapMem = + allocateHeapMem(targetOp, varPtr, varType, mod, rewriter); + if (!heapMem) + targetOp.emitError( + "Unable to allocate heap memory when trying to move " + "a private variable out of the stack and into the " + "heap for use by a deferred target task"); + + if (needsCleanupTask && !fakeDependVar) + fakeDependVar = heapMem; + + // The types of private vars should match before and after the + // transformation. In particular, if the type is a pointer, + // simply record the newly allocated malloc location as the + // new private variable. If, however, the type is not a pointer + // then, we need to load the value from the newly allocated + // location. We'll insert that load later after we have updated + // the malloc'd location with the contents of the original + // variable. + if (!isPrivatizedByValue) + newPrivVars.push_back(heapMem); + + // We now need to copy the original private variable into the newly + // allocated location in the heap. + // Find the earliest insertion point for the copy. This will be before + // the first in the list of omp::MapInfoOp instances that use varPtr. + // After the copy these omp::MapInfoOp instances will refer to heapMem + // instead. 
+ Operation *varPtrDefiningOp = varPtr.getDefiningOp(); + DenseSet users; + if (varPtrDefiningOp) { + users.insert(varPtrDefiningOp->user_begin(), + varPtrDefiningOp->user_end()); + } else { + auto blockArg = cast(varPtr); + users.insert(blockArg.user_begin(), blockArg.user_end()); + } + auto usesVarPtr = [&users](Operation *op) -> bool { + return users.count(op); + }; + + SmallVector chainOfOps; + chainOfOps.push_back(mapInfoOp); + for (auto member : mapInfoOp.getMembers()) { + omp::MapInfoOp memberMap = + cast(member.getDefiningOp()); + if (usesVarPtr(memberMap)) + chainOfOps.push_back(memberMap); + if (memberMap.getVarPtrPtr()) { + Operation *defOp = memberMap.getVarPtrPtr().getDefiningOp(); + if (defOp && usesVarPtr(defOp)) + chainOfOps.push_back(defOp); + } + } + + DominanceInfo dom; + llvm::sort(chainOfOps, [&](Operation *l, Operation *r) { + return dom.dominates(l, r); + }); + + rewriter.setInsertionPoint(chainOfOps.front()); + + Operation *firstOp = chainOfOps.front(); + Location loc = firstOp->getLoc(); + + // Create a llvm.func for 'region' that is marked always_inline and call + // it. + auto createAlwaysInlineFuncAndCallIt = + [&](Region ®ion, llvm::StringRef funcName, + llvm::ArrayRef args, bool returnsValue) -> Value { + assert(!region.empty() && "region cannot be empty"); + LLVM::LLVMFuncOp func = createFuncOpForRegion( + loc, mod, region, funcName, rewriter, returnsValue); + auto call = rewriter.create(loc, func, args); + return call.getResult(); + }; + + Value moldArg, newArg; + if (isPrivatizedByValue) { + moldArg = rewriter.create(loc, varType, varPtr); + newArg = rewriter.create(loc, varType, heapMem); + } else { + moldArg = varPtr; + newArg = heapMem; + } + + Value initializedVal; + if (!privatizer.getInitRegion().empty()) + initializedVal = createAlwaysInlineFuncAndCallIt( + privatizer.getInitRegion(), + llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(), + {moldArg, newArg}, /*returnsValue=*/true); + else + initializedVal = newArg; + + if (isFirstPrivate && !privatizer.getCopyRegion().empty()) + initializedVal = createAlwaysInlineFuncAndCallIt( + privatizer.getCopyRegion(), + llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(), + {moldArg, initializedVal}, /*returnsValue=*/true); + + if (isPrivatizedByValue) + (void)rewriter.create(loc, initializedVal, heapMem); + + // clone origOp, replace all uses of varPtr with heapMem and + // erase origOp. + auto cloneModifyAndErase = [&](Operation *origOp) -> Operation * { + Operation *clonedOp = rewriter.clone(*origOp); + rewriter.replaceAllOpUsesWith(origOp, clonedOp); + rewriter.modifyOpInPlace(clonedOp, [&]() { + clonedOp->replaceUsesOfWith(varPtr, heapMem); + }); + rewriter.eraseOp(origOp); + return clonedOp; + }; + + // Now that we have set up the heap-allocated copy of the private + // variable, rewrite all the uses of the original variable with + // the heap-allocated variable. 
+ rewriter.setInsertionPoint(targetOp); + rewriter.setInsertionPoint(cloneModifyAndErase(mapInfoOp)); + + // Fix any members that may use varPtr to now use heapMem + for (auto member : mapInfoOp.getMembers()) { + auto memberMapInfoOp = cast(member.getDefiningOp()); + if (!usesVarPtr(memberMapInfoOp)) + continue; + rewriter.setInsertionPoint(cloneModifyAndErase(memberMapInfoOp)); + + if (memberMapInfoOp.getVarPtrPtr()) { + Operation *varPtrPtrdefOp = + memberMapInfoOp.getVarPtrPtr().getDefiningOp(); + rewriter.setInsertionPoint(cloneModifyAndErase(varPtrPtrdefOp)); + } + } + + // If the type of the private variable is not a pointer, + // which is typically the case with !fir.boxchar types, then + // we need to ensure that the new private variable is also + // not a pointer. Insert a load from heapMem right before + // targetOp. + if (isPrivatizedByValue) { + rewriter.setInsertionPoint(targetOp); + auto newPrivVar = rewriter.create(mapInfoOp.getLoc(), + varType, heapMem); + newPrivVars.push_back(newPrivVar); + } + + // Deallocate + if (needsCleanupTask) { + if (!cleanupTaskOp) { + assert(fakeDependVar && + "Need a valid value to set up a dependency"); + rewriter.setInsertionPointAfter(targetOp); + omp::TaskOperands taskOperands; + auto inDepend = omp::ClauseTaskDependAttr::get( + rewriter.getContext(), omp::ClauseTaskDepend::taskdependin); + taskOperands.dependKinds.push_back(inDepend); + taskOperands.dependVars.push_back(fakeDependVar); + cleanupTaskOp = omp::TaskOp::create(rewriter, loc, taskOperands); + Block *taskBlock = rewriter.createBlock(&cleanupTaskOp.getRegion()); + rewriter.setInsertionPointToEnd(taskBlock); + rewriter.create(cleanupTaskOp.getLoc()); + } + rewriter.setInsertionPointToStart( + &*cleanupTaskOp.getRegion().getBlocks().begin()); + (void)createAlwaysInlineFuncAndCallIt( + privatizer.getDeallocRegion(), + llvm::formatv("{0}_{1}", privatizer.getSymName(), "dealloc") + .str(), + {initializedVal}, /*returnsValue=*/false); + llvm::FailureOr freeFunc = + LLVM::lookupOrCreateFreeFn(rewriter, mod); + assert(llvm::succeeded(freeFunc) && + "Could not find free in the module"); + (void)rewriter.create(loc, freeFunc.value(), + ValueRange{heapMem}); + } + } + assert(newPrivVars.size() == privateVars.size() && + "The number of private variables must match before and after " + "transformation"); + if (fakeDependVar) { + omp::ClauseTaskDependAttr outDepend = omp::ClauseTaskDependAttr::get( + rewriter.getContext(), omp::ClauseTaskDepend::taskdependout); + SmallVector newDependKinds; + if (!targetOp.getDependVars().empty()) { + std::optional dependKinds = targetOp.getDependKinds(); + assert(dependKinds && "bad depend clause in omp::TargetOp"); + llvm::copy(*dependKinds, std::back_inserter(newDependKinds)); + } + newDependKinds.push_back(outDepend); + ArrayAttr newDependKindsAttr = + ArrayAttr::get(rewriter.getContext(), newDependKinds); + targetOp.getDependVarsMutable().append(fakeDependVar); + targetOp.setDependKindsAttr(newDependKindsAttr); + } + rewriter.setInsertionPoint(targetOp); + targetOp.getPrivateVarsMutable().clear(); + targetOp.getPrivateVarsMutable().assign(newPrivVars); + }); + } + +private: + bool hasPrivateVars(omp::TargetOp targetOp) const { + return !targetOp.getPrivateVars().empty(); + } + + bool isTargetTaskDeferred(omp::TargetOp targetOp) const { + return targetOp.getNowait(); + } + + template + omp::PrivateClauseOp findPrivatizer(OpTy op, Attribute privSym) const { + SymbolRefAttr privatizerName = llvm::cast(privSym); + omp::PrivateClauseOp privatizer = + 
SymbolTable::lookupNearestSymbolFrom( + op, privatizerName); + return privatizer; + } + + // Get the (compile-time constant) size of varType as per the + // given DataLayout dl. + std::int64_t getSizeInBytes(const DataLayout &dl, Type varType) const { + llvm::TypeSize size = dl.getTypeSize(varType); + unsigned short alignment = dl.getTypeABIAlignment(varType); + return llvm::alignTo(size, alignment); + } + + LLVM::LLVMFuncOp getMalloc(ModuleOp mod, IRRewriter &rewriter) const { + llvm::FailureOr mallocCall = + LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type()); + assert(llvm::succeeded(mallocCall) && + "Could not find malloc in the module"); + return mallocCall.value(); + } + + Value allocateHeapMem(omp::TargetOp targetOp, Value privVar, Type varType, + ModuleOp mod, IRRewriter &rewriter) const { + OpBuilder::InsertionGuard guard(rewriter); + Value varPtr = privVar; + Operation *definingOp = varPtr.getDefiningOp(); + BlockArgument blockArg; + if (!definingOp) { + blockArg = mlir::dyn_cast(varPtr); + rewriter.setInsertionPointToStart(blockArg.getParentBlock()); + } else { + rewriter.setInsertionPoint(definingOp); + } + Location loc = definingOp ? definingOp->getLoc() : blockArg.getLoc(); + LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter); + + assert(mod.getDataLayoutSpec() && + "MLIR module with no datalayout spec not handled yet"); + + const DataLayout &dl = DataLayout(mod); + std::int64_t distance = getSizeInBytes(dl, varType); + + Value sizeBytes = rewriter.create( + loc, mallocFn.getFunctionType().getParamType(0), distance); + + auto mallocCallOp = + rewriter.create(loc, mallocFn, ValueRange{sizeBytes}); + return mallocCallOp.getResult(); + } + + // Create a function for srcRegion and attribute it to be always_inline. + // The big assumption here is that srcRegion is one of init, copy or dealloc + // regions of a omp::PrivateClauseop. Accordingly, the return type is assumed + // to either be the same as the types of the two arguments of the region (for + // init and copy regions) or void as would be the case for dealloc regions. + LLVM::LLVMFuncOp createFuncOpForRegion(Location loc, ModuleOp mod, + Region &srcRegion, + llvm::StringRef funcName, + IRRewriter &rewriter, + bool returnsValue = false) { + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(mod.getBody(), mod.getBody()->end()); + Region clonedRegion; + IRMapping mapper; + srcRegion.cloneInto(&clonedRegion, mapper); + + SmallVector paramTypes; + llvm::copy(srcRegion.getArgumentTypes(), std::back_inserter(paramTypes)); + Type resultType = returnsValue + ? 
srcRegion.getArgument(0).getType() + : LLVM::LLVMVoidType::get(rewriter.getContext()); + LLVM::LLVMFunctionType funcType = + LLVM::LLVMFunctionType::get(resultType, paramTypes); + + LLVM::LLVMFuncOp func = + LLVM::LLVMFuncOp::create(rewriter, loc, funcName, funcType); + func.setAlwaysInline(true); + rewriter.inlineRegionBefore(clonedRegion, func.getRegion(), + func.getRegion().end()); + for (auto &block : func.getRegion().getBlocks()) { + if (isa(block.getTerminator())) { + omp::YieldOp yieldOp = cast(block.getTerminator()); + rewriter.setInsertionPoint(yieldOp); + rewriter.replaceOpWithNewOp(yieldOp, TypeRange(), + yieldOp.getOperands()); + } + } + return func; + } +}; +} // namespace diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp index dd413d2de8710..d7e321a61d4ac 100644 --- a/mlir/lib/RegisterAllPasses.cpp +++ b/mlir/lib/RegisterAllPasses.cpp @@ -33,6 +33,7 @@ #include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Dialect/NVGPU/Transforms/Passes.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" #include "mlir/Dialect/Quant/Transforms/Passes.h" #include "mlir/Dialect/SCF/Transforms/Passes.h" #include "mlir/Dialect/SPIRV/Transforms/Passes.h" @@ -80,6 +81,7 @@ void mlir::registerAllPasses() { memref::registerMemRefPasses(); shard::registerShardPasses(); ml_program::registerMLProgramPasses(); + omp::registerOpenMPPasses(); quant::registerQuantPasses(); registerSCFPasses(); registerShapePasses(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index b851414ece118..f28454075f1d3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -357,14 +357,8 @@ static LogicalResult checkImplementationStatus(Operation &op) { result = todo("priority"); }; auto checkPrivate = [&todo](auto op, LogicalResult &result) { - if constexpr (std::is_same_v, omp::TargetOp>) { - // Privatization is supported only for included target tasks. 
- if (!op.getPrivateVars().empty() && op.getNowait()) - result = todo("privatization for deferred target tasks"); - } else { - if (!op.getPrivateVars().empty() || op.getPrivateSyms()) - result = todo("privatization"); - } + if (!op.getPrivateVars().empty() || op.getPrivateSyms()) + result = todo("privatization"); }; auto checkReduction = [&todo](auto op, LogicalResult &result) { if (isa(op)) @@ -451,7 +445,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { checkDevice(op, result); checkInReduction(op, result); checkIsDevicePtr(op, result); - checkPrivate(op, result); }) .Default([](Operation &) { // Assume all clauses for an operation can be translated unless they are diff --git a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir new file mode 100644 index 0000000000000..8972a083e2c47 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir @@ -0,0 +1,157 @@ +// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array, "dlti.stack_alignment" = 128 : i64>} { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + + omp.private {type = firstprivate} @private_eye : i32 copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + llvm.store %0, %arg1 : i32, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) + } + omp.private {type = firstprivate} @boxchar_firstprivate : !llvm.struct<(ptr, i64)> init { + ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: !llvm.struct<(ptr, i64)>): + %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)> + %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)> + %8 = llvm.call @malloc(%1) {bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr + %9 = llvm.mlir.undef : !llvm.struct<(ptr, i64)> + %10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr, i64)> + %11 = llvm.insertvalue %1, %10[1] : !llvm.struct<(ptr, i64)> + omp.yield(%11 : !llvm.struct<(ptr, i64)>) + } copy { + ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: !llvm.struct<(ptr, i64)>): + %3 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)> + %4 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)> + %5 = llvm.extractvalue %arg1[0] : !llvm.struct<(ptr, i64)> + %6 = llvm.extractvalue %arg1[1] : !llvm.struct<(ptr, i64)> + %7 = llvm.icmp "slt" %6, %4 : i64 + %8 = llvm.select %7, %6, %4 : i1, i64 + "llvm.intr.memmove"(%5, %3, %8) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () + omp.yield(%arg1 : !llvm.struct<(ptr, i64)>) + } dealloc { + ^bb0(%arg0: !llvm.struct<(ptr, i64)>): + %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)> + %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)> + llvm.call @free(%0) : (!llvm.ptr) -> () + omp.yield + } + + llvm.func @target_boxchar_(%arg0: !llvm.ptr {fir.bindc_name = "l"}) attributes 
{fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind, target_cpu = "x86-64"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : index) : i64 + %5 = llvm.mlir.constant(0 : index) : i64 + %6 = llvm.mlir.constant(0 : i32) : i32 + %7 = llvm.mlir.constant(1 : i64) : i64 + %8 = llvm.mlir.constant(1 : i64) : i64 + %9 = llvm.load %arg0 : !llvm.ptr -> i32 + %10 = llvm.icmp "sgt" %9, %6 : i32 + %11 = llvm.select %10, %9, %6 : i1, i32 + %12 = llvm.mlir.constant(1 : i64) : i64 + %13 = llvm.sext %11 : i32 to i64 + %14 = llvm.alloca %13 x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr + %15 = llvm.mlir.undef : !llvm.struct<(ptr, i64)> + %16 = llvm.sext %11 : i32 to i64 + %17 = llvm.insertvalue %14, %15[0] : !llvm.struct<(ptr, i64)> + %18 = llvm.insertvalue %16, %17[1] : !llvm.struct<(ptr, i64)> + llvm.store %18, %3 : !llvm.struct<(ptr, i64)>, !llvm.ptr + %19 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64)> + %20 = llvm.extractvalue %19[0] : !llvm.struct<(ptr, i64)> + %21 = llvm.extractvalue %19[1] : !llvm.struct<(ptr, i64)> + %22 = llvm.sub %21, %4 : i64 + %23 = omp.map.bounds lower_bound(%5 : i64) upper_bound(%22 : i64) extent(%21 : i64) stride(%4 : i64) start_idx(%5 : i64) {stride_in_bytes = true} + %24 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)> + %25 = omp.map.info var_ptr(%3 : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%24 : !llvm.ptr) bounds(%23) -> !llvm.ptr + %26 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%25 : [0] : !llvm.ptr) -> !llvm.ptr + %27 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr + omp.target nowait map_entries(%26 -> %arg1, %27 -> %arg2, %25 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %18 -> %arg4 [map_idx=0], @private_eye %1 -> %arg5 [map_idx=1] : !llvm.struct<(ptr, i64)>, !llvm.ptr) { + omp.terminator + } + llvm.return + } +} +// CHECK-LABEL: llvm.func @target_boxchar_( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind, target_cpu = "x86-64"} { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr +// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(16 : i64) : i64 +// CHECK: %[[HEAP0:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_2]] x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr +// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_8:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_11:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32 +// CHECK: %[[VAL_12:.*]] = llvm.icmp "sgt" %[[VAL_11]], %[[VAL_8]] : i32 +// CHECK: %[[VAL_13:.*]] = llvm.select %[[VAL_12]], %[[VAL_11]], %[[VAL_8]] : i1, i32 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_15:.*]] = llvm.sext %[[VAL_13]] : i32 to i64 +// 
CHECK: %[[VAL_16:.*]] = llvm.alloca %[[VAL_15]] x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr +// CHECK: %[[VAL_17:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_18:.*]] = llvm.sext %[[VAL_13]] : i32 to i64 +// CHECK: %[[VAL_19:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_17]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_19]][1] : !llvm.struct<(ptr, i64)> +// CHECK: llvm.store %[[VAL_20]], %[[VAL_5]] : !llvm.struct<(ptr, i64)>, !llvm.ptr +// CHECK: %[[VAL_21:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_21]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_21]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_24:.*]] = llvm.sub %[[VAL_23]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_25:.*]] = omp.map.bounds lower_bound(%[[VAL_7]] : i64) upper_bound(%[[VAL_24]] : i64) extent(%[[VAL_23]] : i64) stride(%[[VAL_6]] : i64) start_idx(%[[VAL_7]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_26:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_27:.*]] = llvm.load %[[HEAP0]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_28:.*]] = llvm.call @boxchar_firstprivate_init(%[[VAL_26]], %[[VAL_27]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_29:.*]] = llvm.call @boxchar_firstprivate_copy(%[[VAL_26]], %[[VAL_28]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> +// CHECK: llvm.store %[[VAL_29]], %[[HEAP0]] : !llvm.struct<(ptr, i64)>, !llvm.ptr +// CHECK: %[[VAL_30:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr +// CHECK: %[[VAL_31:.*]] = llvm.getelementptr %[[HEAP0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_32:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_31]] : !llvm.ptr) bounds(%[[VAL_25]]) -> !llvm.ptr +// CHECK: %[[VAL_33:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%[[VAL_32]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_34:.*]] = llvm.load %[[HEAP0]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: omp.target depend(taskdependout -> %[[HEAP0]] : !llvm.ptr) nowait map_entries(%[[VAL_33]] -> %[[VAL_35:.*]], %[[VAL_30]] -> %[[VAL_36:.*]], %[[VAL_32]] -> %[[VAL_37:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %[[VAL_34]] -> %[[VAL_38:.*]] [map_idx=0], @private_eye %[[VAL_1]] -> %[[VAL_39:.*]] [map_idx=1] : !llvm.struct<(ptr, i64)>, !llvm.ptr) { +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.task depend(taskdependin -> %[[HEAP0]] : !llvm.ptr) { +// CHECK: llvm.call @boxchar_firstprivate_dealloc(%[[VAL_29]]) : (!llvm.struct<(ptr, i64)>) -> () +// CHECK: llvm.call @free(%[[HEAP0]]) : (!llvm.ptr) -> () +// CHECK: omp.terminator +// CHECK: } +// CHECK: llvm.return +// CHECK: } + +// CHECK-LABEL: llvm.func @boxchar_firstprivate_init( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_2:.*]] = llvm.call @malloc(%[[VAL_1]]) 
{bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr +// CHECK: %[[VAL_3:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_4:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_3]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_4]][1] : !llvm.struct<(ptr, i64)> +// CHECK: llvm.return %[[VAL_5]] : !llvm.struct<(ptr, i64)> +// CHECK: } + +// CHECK-LABEL: llvm.func @boxchar_firstprivate_copy( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[ARG1]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[ARG1]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_4:.*]] = llvm.icmp "slt" %[[VAL_3]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_5:.*]] = llvm.select %[[VAL_4]], %[[VAL_3]], %[[VAL_1]] : i1, i64 +// CHECK: "llvm.intr.memmove"(%[[VAL_2]], %[[VAL_0]], %[[VAL_5]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +// CHECK: llvm.return %[[ARG1]] : !llvm.struct<(ptr, i64)> +// CHECK: } + +// CHECK-LABEL: llvm.func @boxchar_firstprivate_dealloc( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>) attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)> +// CHECK: llvm.call @free(%[[VAL_0]]) : (!llvm.ptr) -> () +// CHECK: llvm.return +// CHECK: } diff --git a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir new file mode 100644 index 0000000000000..0377d4962cba9 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir @@ -0,0 +1,201 @@ +// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array, "dlti.stack_alignment" = 128 : i64>} { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + + omp.private {type = firstprivate} @firstprivatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i64) : i64 + %1 = llvm.call @malloc(%0) : (i64) -> !llvm.ptr + %2 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %1, %2 : !llvm.ptr, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) + } copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i32) : i32 + "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> 
() + omp.yield(%arg1 : !llvm.ptr) + } dealloc { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) -> () + omp.yield + } + omp.private {type = firstprivate} @firstprivatizer_1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i64) : i64 + %1 = llvm.call @malloc(%0) : (i64) -> !llvm.ptr + %2 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %1, %2 : !llvm.ptr, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) + } copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i32) : i32 + "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + omp.yield(%arg1 : !llvm.ptr) + } dealloc { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) -> () + omp.yield + } + + llvm.func internal @firstprivate_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.mlir.constant(0 : index) : i64 + %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr + %20 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "glocal"} : (i32) -> !llvm.ptr + %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr + %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr + llvm.store %33, %20 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr + llvm.store %0, %21 : i32, !llvm.ptr + %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit) capture(ByCopy) -> !llvm.ptr {name = "i"} + %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %151 = llvm.load %150 : !llvm.ptr -> i64 + %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %153 = llvm.load %152 : !llvm.ptr -> i64 + %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %155 = llvm.load %154 : !llvm.ptr -> i64 + %156 = llvm.sub %153, %1 : i64 + %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true} + %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""} + %160 = omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr + %1501 = llvm.getelementptr %20[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, 
array<1 x array<3 x i64>>)> + %1511 = llvm.load %1501 : !llvm.ptr -> i64 + %1521 = llvm.getelementptr %20[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %1531 = llvm.load %1521 : !llvm.ptr -> i64 + %1541 = llvm.getelementptr %20[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %1551 = llvm.load %1541 : !llvm.ptr -> i64 + %1561 = llvm.sub %1531, %1 : i64 + %1571 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%1561 : i64) extent(%1531 : i64) stride(%1551 : i64) start_idx(%1511 : i64) {stride_in_bytes = true} + %1581 = llvm.getelementptr %20[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %1591 = omp.map.info var_ptr(%20 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%1581 : !llvm.ptr) bounds(%1571) -> !llvm.ptr {name = ""} + %1601 = omp.map.info var_ptr(%20 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%1591 : [0] : !llvm.ptr) -> !llvm.ptr + + // Test with two firstprivate variables so that we test that even if there are multiple variables to be cleaned up + // only one cleanup omp.task is generated. + omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8, %1601 -> %arg9, %1591 -> %arg10 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %19 -> %arg11 [map_idx=1], @firstprivatizer_1 %20 -> %arg12 [map_idx=3] : !llvm.ptr, !llvm.ptr) { + omp.terminator + } + %166 = llvm.mlir.constant(48 : i32) : i32 + %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr + llvm.call @free(%168) : (!llvm.ptr) -> () + llvm.return + } + +} +// CHECK-LABEL: llvm.func @free(!llvm.ptr) +// CHECK: llvm.func @malloc(i64) -> !llvm.ptr + + +// CHECK-LABEL: llvm.func internal @firstprivate_test( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "ptr0"}, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr {fir.bindc_name = "ptr1"}) { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32 +// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_2:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[HEAP0:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[HEAP1:.*]] = llvm.call @malloc(%[[VAL_6]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "glocal"} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_9:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_10:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_10]], %[[VAL_5]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr +// CHECK: llvm.store %[[VAL_10]], 
%[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr +// CHECK: llvm.store %[[VAL_0]], %[[VAL_9]] : i32, !llvm.ptr +// CHECK: %[[VAL_11:.*]] = omp.map.info var_ptr(%[[VAL_9]] : !llvm.ptr, i32) map_clauses(implicit) capture(ByCopy) -> !llvm.ptr {name = "i"} +// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_16:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_17:.*]] = llvm.load %[[VAL_16]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_18:.*]] = llvm.sub %[[VAL_15]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_19:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_18]] : i64) extent(%[[VAL_15]] : i64) stride(%[[VAL_17]] : i64) start_idx(%[[VAL_13]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_20:.*]] = llvm.call @firstprivatizer_init(%[[VAL_5]], %[[HEAP0]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_21:.*]] = llvm.call @firstprivatizer_copy(%[[VAL_5]], %[[VAL_20]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_22:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_23:.*]] = llvm.load %[[VAL_22]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_26]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_28:.*]] = llvm.sub %[[VAL_25]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_29:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_28]] : i64) extent(%[[VAL_25]] : i64) stride(%[[VAL_27]] : i64) start_idx(%[[VAL_23]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_30:.*]] = llvm.call @firstprivatizer_1_init(%[[VAL_8]], %[[HEAP1]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_31:.*]] = llvm.call @firstprivatizer_1_copy(%[[VAL_8]], %[[VAL_30]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_32:.*]] = llvm.getelementptr %[[HEAP0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_33:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, i32) map_clauses({{.*}}to{{.*}}) capture(ByRef) var_ptr_ptr(%[[VAL_32]] : !llvm.ptr) bounds(%[[VAL_19]]) -> !llvm.ptr {name = ""} +// CHECK: %[[VAL_34:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always,{{.*}}to) capture(ByRef) members(%[[VAL_33]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_35:.*]] = llvm.getelementptr %[[HEAP1]][0, 0] : 
(!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_36:.*]] = omp.map.info var_ptr(%[[HEAP1]] : !llvm.ptr, i32) map_clauses({{.*}}to{{.*}}) capture(ByRef) var_ptr_ptr(%[[VAL_35]] : !llvm.ptr) bounds(%[[VAL_29]]) -> !llvm.ptr {name = ""} +// CHECK: %[[VAL_37:.*]] = omp.map.info var_ptr(%[[HEAP1]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always,{{.*}}to) capture(ByRef) members(%[[VAL_36]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: omp.target depend(taskdependout -> %[[HEAP0]] : !llvm.ptr) nowait map_entries(%[[VAL_11]] -> %[[VAL_38:.*]], %[[VAL_34]] -> %[[VAL_39:.*]], %[[VAL_33]] -> %[[VAL_40:.*]], %[[VAL_37]] -> %[[VAL_41:.*]], %[[VAL_36]] -> %[[VAL_42:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %[[HEAP0]] -> %[[VAL_43:.*]] [map_idx=1], @firstprivatizer_1 %[[HEAP1]] -> %[[VAL_44:.*]] [map_idx=3] : !llvm.ptr, !llvm.ptr) { +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.task depend(taskdependin -> %[[HEAP0]] : !llvm.ptr) { +// CHECK: llvm.call @firstprivatizer_1_dealloc(%[[VAL_31]]) : (!llvm.ptr) -> () +// CHECK: llvm.call @free(%[[HEAP1]]) : (!llvm.ptr) -> () +// CHECK: llvm.call @firstprivatizer_dealloc(%[[VAL_21]]) : (!llvm.ptr) -> () +// CHECK: llvm.call @free(%[[HEAP0]]) : (!llvm.ptr) -> () +// CHECK: omp.terminator +// CHECK: } +// CHECK: %[[VAL_45:.*]] = llvm.mlir.constant(48 : i32) : i32 +// CHECK: %[[VAL_46:.*]] = llvm.getelementptr %[[VAL_5]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_47:.*]] = llvm.load %[[VAL_46]] : !llvm.ptr -> !llvm.ptr +// CHECK: llvm.call @free(%[[VAL_47]]) : (!llvm.ptr) -> () +// CHECK: llvm.return +// CHECK: } + +// CHECK-LABEL: llvm.func @firstprivatizer_init( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[VAL_1:.*]] = llvm.call @malloc(%[[VAL_0]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[ARG1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_1]], %[[VAL_2]] : !llvm.ptr, !llvm.ptr +// CHECK: llvm.return %[[ARG1]] : !llvm.ptr +// CHECK: } + +// CHECK-LABEL: llvm.func @firstprivatizer_copy( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: llvm.return %[[ARG1]] : !llvm.ptr +// CHECK: } + +// CHECK-LABEL: llvm.func @firstprivatizer_dealloc( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) attributes {always_inline} { +// CHECK: llvm.call @free(%[[ARG0]]) : (!llvm.ptr) -> () +// CHECK: llvm.return +// CHECK: } + +// CHECK-LABEL: llvm.func @firstprivatizer_1_init( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[VAL_1:.*]] = llvm.call @malloc(%[[VAL_0]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[ARG1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// 
CHECK: llvm.store %[[VAL_1]], %[[VAL_2]] : !llvm.ptr, !llvm.ptr +// CHECK: llvm.return %[[ARG1]] : !llvm.ptr +// CHECK: } + +// CHECK-LABEL: llvm.func @firstprivatizer_1_copy( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: llvm.return %[[ARG1]] : !llvm.ptr +// CHECK: } + +// CHECK-LABEL: llvm.func @firstprivatizer_1_dealloc( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) attributes {always_inline} { +// CHECK: llvm.call @free(%[[ARG0]]) : (!llvm.ptr) -> () +// CHECK: llvm.return +// CHECK: } diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 2fa4470bb8300..af6d254cfd3c3 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -249,24 +249,6 @@ llvm.func @target_is_device_ptr(%x : !llvm.ptr) { // ----- -omp.private {type = firstprivate} @x.privatizer : i32 copy { -^bb0(%mold: !llvm.ptr, %private: !llvm.ptr): - %0 = llvm.load %mold : !llvm.ptr -> i32 - llvm.store %0, %private : i32, !llvm.ptr - omp.yield(%private: !llvm.ptr) -} -llvm.func @target_firstprivate(%x : !llvm.ptr) { - %0 = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr - // expected-error@below {{not yet implemented: Unhandled clause privatization for deferred target tasks in omp.target operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.target}} - omp.target nowait map_entries(%0 -> %blockarg0 : !llvm.ptr) private(@x.privatizer %x -> %arg0 [map_idx=0] : !llvm.ptr) { - omp.terminator - } - llvm.return -} - -// ----- - llvm.func @target_enter_data_depend(%x: !llvm.ptr) { // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_enter_data operation}} // expected-error@below {{LLVM Translation failed for operation: omp.target_enter_data}} From 0e807a4261913f73ce68052cb2707b4e7ae89e7b Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 22 Oct 2025 10:32:34 -0700 Subject: [PATCH 141/530] [SPIRV][HLSL] Fix assert with cbuffers through constexpr (#164555) The comment here pointed out that RAUW would fall over given a constantexpr, but then proceeded to just do what RAUW does by hand, which falls over in the same way. Instead, convert constantexprs involving cbuffer globals to instructions before processing them. The test update just modifies the existing cbuffer test, since it implied it was trying to test this exact case anyways. 
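For illustration, the idiom this fix relies on can be exercised in isolation roughly as below. This is a minimal sketch, not the SPIRVCBufferAccess code itself: the module, global, and function names are invented for the example, and the alloca merely stands in for the spv_resource_getpointer call that the pass actually substitutes for the cbuffer member.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/ReplaceConstant.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("cbuffer_demo", Ctx);

      // A global reached only through a ConstantExpr GEP, mimicking a cbuffer
      // member accessed as getelementptr inbounds (..., ptr @g, ...).
      auto *ArrTy = ArrayType::get(Type::getFloatTy(Ctx), 4);
      auto *G = new GlobalVariable(M, ArrTy, /*isConstant=*/false,
                                   GlobalValue::ExternalLinkage, nullptr, "g");

      auto *FnTy = FunctionType::get(Type::getFloatTy(Ctx), /*isVarArg=*/false);
      Function *F = Function::Create(FnTy, GlobalValue::ExternalLinkage, "f", M);
      IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
      // With the default constant folder this GEP becomes a ConstantExpr.
      Value *GEP = B.CreateConstInBoundsGEP2_32(ArrTy, G, 0, 1);
      B.CreateRet(B.CreateLoad(Type::getFloatTy(Ctx), GEP));

      // Step 1: turn ConstantExpr users of the global into real instructions,
      // so replacing the global with a non-constant pointer becomes legal.
      SmallVector<Constant *, 1> Consts{G};
      convertUsersOfConstantsToInstructions(Consts);

      // Step 2: now a plain replaceAllUsesWith with an arbitrary pointer works.
      IRBuilder<> Entry(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewPtr = Entry.CreateAlloca(ArrTy, nullptr, "resource.ptr");
      G->replaceAllUsesWith(NewPtr);

      M.print(outs(), nullptr);
      return 0;
    }

The point of the two-step order is that replaceAllUsesWith cannot splice a non-constant value into a ConstantExpr, which is exactly the case the old hand-rolled replaceUsesOfWith loop also tripped over; lowering those users to instructions first removes the problem entirely.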
--- llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp | 14 ++++++++------ llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll | 6 ++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp index f7fb886e7391d..3ca0b40cac93e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ReplaceConstant.h" #define DEBUG_TYPE "spirv-cbuffer-access" using namespace llvm; @@ -57,6 +58,12 @@ static bool replaceCBufferAccesses(Module &M) { if (!CBufMD) return false; + SmallVector CBufferGlobals; + for (const hlsl::CBufferMapping &Mapping : *CBufMD) + for (const hlsl::CBufferMember &Member : Mapping.Members) + CBufferGlobals.push_back(Member.GV); + convertUsersOfConstantsToInstructions(CBufferGlobals); + for (const hlsl::CBufferMapping &Mapping : *CBufMD) { Instruction *HandleDef = findHandleDef(Mapping.Handle); if (!HandleDef) { @@ -80,12 +87,7 @@ static bool replaceCBufferAccesses(Module &M) { Value *GetPointerCall = Builder.CreateIntrinsic( PtrType, Intrinsic::spv_resource_getpointer, {HandleDef, IndexVal}); - // We cannot use replaceAllUsesWith here because some uses may be - // ConstantExprs, which cannot be replaced with non-constants. - SmallVector Users(MemberGV->users()); - for (User *U : Users) { - U->replaceUsesOfWith(MemberGV, GetPointerCall); - } + MemberGV->replaceAllUsesWith(GetPointerCall); } } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll index 4d32e66d017c9..6d41875798ebc 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s -; Test that uses of cbuffer members inside ConstantExprs are handled correctly. +; Test that uses of cbuffer members are handled correctly. ; CHECK-DAG: OpDecorate %[[MyCBuffer:[0-9]+]] DescriptorSet 0 ; CHECK-DAG: OpDecorate %[[MyCBuffer]] Binding 0 @@ -37,10 +37,8 @@ entry: ; CHECK: %[[tmp_ptr:[0-9]+]] = OpAccessChain {{%[0-9]+}} %[[tmp]] %[[uint_0]] %[[uint_0]] ; CHECK: %[[v_ptr:.+]] = OpAccessChain %[[_ptr_Uniform_v4float]] %[[tmp]] %[[uint_0]] %[[uint_1]] ; CHECK: %[[s_ptr_gep:[0-9]+]] = OpInBoundsAccessChain %[[_ptr_Uniform_float]] %[[tmp_ptr]] %[[uint_0]] %[[uint_1]] - %gep = getelementptr inbounds %MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1 - ; CHECK: %[[s_val:.+]] = OpLoad %[[float]] %[[s_ptr_gep]] - %load_from_gep = load float, ptr addrspace(12) %gep, align 4 + %load_from_gep = load float, ptr addrspace(12) getelementptr inbounds (%MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1), align 4 ; CHECK: %[[v_val:.+]] = OpLoad %[[v4float]] %[[v_ptr]] %load_v = load <4 x float>, ptr addrspace(12) @v, align 16 From e9c7966046803a43b711d2b309e189ea8a3e0e85 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 22 Oct 2025 10:35:16 -0700 Subject: [PATCH 142/530] [DirectX] Fix crash when naming buffers of arrays (#164553) DXILResource was falling over trying to name a resource type that contained an array, such as `StructuredBuffer`. Handle this by walking through array types to gather the dimensions. 
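For illustration, the dimension-gathering loop can be exercised on its own along these lines. This is a minimal sketch under the assumption that only the array-walking part matters here; formatElementWithDims and its use of Type::print are stand-ins for the real formatTypeName/element-name lookup in DXILResource.cpp.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/Support/Casting.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    using namespace llvm;

    // Peel nested array types, remembering each dimension, then print the scalar
    // element followed by the dimensions, e.g. [3 x [2 x float]] -> "float[3][2]".
    static std::string formatElementWithDims(Type *ContainedType) {
      SmallVector<uint64_t, 4> Dims;
      while (auto *AT = dyn_cast<ArrayType>(ContainedType)) {
        Dims.push_back(AT->getNumElements());
        ContainedType = AT->getElementType();
      }
      std::string Out;
      raw_string_ostream OS(Out);
      ContainedType->print(OS); // stand-in for the DXIL element-name mapping
      for (uint64_t Dim : Dims)
        OS << "[" << Dim << "]";
      return Out;
    }

    int main() {
      LLVMContext Ctx;
      // [3 x [2 x float]], matching the StructuredBuffer case added to the test.
      Type *Ty = ArrayType::get(ArrayType::get(Type::getFloatTy(Ctx), 2), 3);
      outs() << formatElementWithDims(Ty) << "\n"; // float[3][2]
      return 0;
    }

Walking the array wrappers before the element-type dispatch is what keeps the existing scalar/vector handling untouched while still producing a readable name for buffers of arrays instead of crashing.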
--- llvm/lib/Analysis/DXILResource.cpp | 8 ++++++++ llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll | 9 ++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index f9bf09262dd1f..6f19a68dcd194 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -255,6 +255,12 @@ static void formatTypeName(SmallString<64> &Dest, StringRef Name, if (!ContainedType) return; + SmallVector ArrayDimensions; + while (ArrayType *AT = dyn_cast(ContainedType)) { + ArrayDimensions.push_back(AT->getNumElements()); + ContainedType = AT->getElementType(); + } + StringRef ElementName; ElementType ET = toDXILElementType(ContainedType, IsSigned); if (ET != ElementType::Invalid) { @@ -271,6 +277,8 @@ static void formatTypeName(SmallString<64> &Dest, StringRef Name, DestStream << "<" << ElementName; if (const FixedVectorType *VTy = dyn_cast(ContainedType)) DestStream << VTy->getNumElements(); + for (uint64_t Dim : ArrayDimensions) + DestStream << "[" << Dim << "]"; DestStream << ">"; } diff --git a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll index 4f13f4789cd66..56798c8382d45 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll @@ -28,6 +28,11 @@ define void @test() { @llvm.dx.resource.handlefrombinding(i32 0, i32 10, i32 1, i32 0, ptr @SB.str) ; CHECK: %"StructuredBuffer" = type { %struct.S } + ; StructuredBuffer + %struct1 = call target("dx.RawBuffer", [3 x [2 x float]], 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 12, i32 1, i32 0, ptr null) + ; CHECK: %"StructuredBuffer" = type { [3 x [2 x float]] } + ; ByteAddressBuffer %byteaddr = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 20, i32 1, i32 0, ptr null) @@ -40,12 +45,14 @@ define void @test() { ; CHECK-NEXT: @[[T1:.*]] = external constant %"Buffer" ; CHECK-NEXT: @[[T2:.*]] = external constant %"Buffer" ; CHECK-NEXT: @[[S0:.*]] = external constant %"StructuredBuffer" +; CHECK-NEXT: @[[S1:.*]] = external constant %"StructuredBuffer" ; CHECK-NEXT: @[[B0:.*]] = external constant %ByteAddressBuffer ; CHECK: !{i32 0, ptr @[[T0]], !"A" ; CHECK: !{i32 1, ptr @[[T1]], !"" ; CHECK: !{i32 2, ptr @[[T2]], !"" ; CHECK: !{i32 3, ptr @[[S0]], !"SB" -; CHECK: !{i32 4, ptr @[[B0]], !"" +; CHECK: !{i32 4, ptr @[[S1]], !"" +; CHECK: !{i32 5, ptr @[[B0]], !"" attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } From 735b1ad667ac7373c89ccc0f0e757ef418f8f790 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 22 Oct 2025 10:46:37 -0700 Subject: [PATCH 143/530] [bazel] Fix Formatting (#164672) To make the CI happy again. 
--- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 2d9433f6e4dc7..599bc4b3d8bbf 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -3093,8 +3093,8 @@ libc_support_library( libc_support_library( name = "__support_sincosf_utils", hdrs = [ - "src/__support/math/sincosf_utils.h", "src/__support/math/sincosf_float_eval.h", + "src/__support/math/sincosf_utils.h", ], deps = [ ":__support_fputil_double_double", From 6c8ad30a82a7ed240cbc3abc07c6759905fecd5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 22 Oct 2025 07:55:11 -1000 Subject: [PATCH 144/530] [flang][cuda] Do not consider function result as host array (#164669) The function result in a device function is not a host array. Avoid triggering the error `Host array 'res' cannot be present in device context` for this. --- flang/lib/Semantics/check-cuda.cpp | 3 +++ flang/test/Semantics/cuf09.cuf | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp index 3d2db6a9c8aa9..caa9bdd28f1f4 100644 --- a/flang/lib/Semantics/check-cuda.cpp +++ b/flang/lib/Semantics/check-cuda.cpp @@ -131,6 +131,9 @@ struct FindHostArray return (*this)(x.base()); } Result operator()(const Symbol &symbol) const { + if (symbol.IsFuncResult()) { + return nullptr; + } if (const auto *details{ symbol.GetUltimate().detailsIf()}) { if (details->IsArray() && diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf index 9178b0a63adbe..df6568df9b480 100644 --- a/flang/test/Semantics/cuf09.cuf +++ b/flang/test/Semantics/cuf09.cuf @@ -36,6 +36,12 @@ module m if (i .le. N) a(i) = m(i) end subroutine + attributes(device) function devfct(r1, r2) result(res) + real(4), intent(in) :: r1(3), r2(3) + real(4) :: res(3) + res = r1 - r2 ! Do not error on function result + end function + attributes(global) subroutine hostparameter(a) integer :: a(*) i = threadIdx%x From c70d0812ba17953b3e405eb2b01dd892b94585f1 Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:58:38 -0700 Subject: [PATCH 145/530] [MIR2Vec] Handle Operands (#163281) Handling opcodes in embedding computation. - Revamped MIR Vocabulary with four sections - `Opcodes`, `Common Operands`, `Physical Registers`, and `Virtual Registers` - Operands broadly fall into 3 categories -- the generic MO types that are common across architectures, physical and virtual register classes. We handle these categories separately in MIR2Vec. (Though we have same classes for both physical and virtual registers, their embeddings vary). 
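To make the role of the three weights concrete: per instruction, the embedder presumably accumulates the opcode vector scaled by OpcWeight plus each operand's vector scaled by RegOperandWeight or CommonOperandWeight, depending on which vocabulary section the operand falls into. The sketch below is only a conceptual model of that accumulation using plain std::vector, not the actual MIREmbedder::computeEmbeddings; the function name and the exact combination order are assumptions.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    using Embedding = std::vector<double>;

    // Conceptual per-instruction embedding: a weighted sum of the opcode vector
    // and the vectors of its register / common (non-register) operands.
    Embedding weightedInstructionEmbedding(
        const Embedding &Opcode, const std::vector<Embedding> &RegOperands,
        const std::vector<Embedding> &CommonOperands, double OpcWeight,
        double RegOperandWeight, double CommonOperandWeight) {
      Embedding Result(Opcode.size(), 0.0);
      auto AddScaled = [&Result](const Embedding &E, double W) {
        for (size_t I = 0; I < Result.size(); ++I)
          Result[I] += W * E[I];
      };
      AddScaled(Opcode, OpcWeight);
      for (const Embedding &E : RegOperands)
        AddScaled(E, RegOperandWeight);
      for (const Embedding &E : CommonOperands)
        AddScaled(E, CommonOperandWeight);
      return Result;
    }

    int main() {
      // Toy 2-D vocabulary entries: one opcode, one register operand, and one
      // immediate (common) operand.
      Embedding E = weightedInstructionEmbedding(
          {1.0, 0.5}, {{0.2, 0.1}}, {{0.3, 0.3}},
          /*OpcWeight=*/1.0, /*RegOperandWeight=*/0.5,
          /*CommonOperandWeight=*/0.5);
      std::cout << E[0] << " " << E[1] << "\n"; // 1.25 0.7
      return 0;
    }

Separate weights per section let users down-weight operand information relative to opcodes (or vice versa) without retraining the seed vocabulary, which is why the patch exposes them as distinct cl::opt flags.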
--- llvm/include/llvm/CodeGen/MIR2Vec.h | 138 ++++++- .../models/x86SeedEmbeddingVocab100D.json | 70 ++-- llvm/lib/CodeGen/MIR2Vec.cpp | 335 +++++++++++++--- .../Inputs/mir2vec_dummy_2D_vocab.json | 18 +- .../Inputs/mir2vec_dummy_3D_vocab.json | 18 +- .../Inputs/mir2vec_inconsistent_dims.json | 11 +- .../MIR2Vec/Inputs/mir2vec_zero_vocab.json | 11 +- .../Inputs/reference_x86_vocab_print.txt | 291 ++++++++++++++ .../reference_x86_vocab_wo=0.5_print.txt | 291 ++++++++++++++ llvm/test/CodeGen/MIR2Vec/if-else.mir | 8 +- .../MIR2Vec/mir2vec-basic-symbolic.mir | 22 +- .../CodeGen/MIR2Vec/vocab-error-handling.ll | 6 +- llvm/unittests/CodeGen/MIR2VecTest.cpp | 369 +++++++++++++++--- 13 files changed, 1422 insertions(+), 166 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h index f6b0571f5dac6..4bcbad7b53082 100644 --- a/llvm/include/llvm/CodeGen/MIR2Vec.h +++ b/llvm/include/llvm/CodeGen/MIR2Vec.h @@ -35,6 +35,8 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -61,7 +63,7 @@ class MIREmbedder; class SymbolicMIREmbedder; extern llvm::cl::OptionCategory MIR2VecCategory; -extern cl::opt OpcWeight; +extern cl::opt OpcWeight, CommonOperandWeight, RegOperandWeight; using Embedding = ir2vec::Embedding; using MachineInstEmbeddingsMap = DenseMap; @@ -74,31 +76,114 @@ class MIRVocabulary { friend class llvm::MIR2VecVocabLegacyAnalysis; using VocabMap = std::map; -private: - // Define vocabulary layout - adapted for MIR + // MIRVocabulary Layout: + // +-------------------+-----------------------------------------------------+ + // | Entity Type | Description | + // +-------------------+-----------------------------------------------------+ + // | 1. Opcodes | Target specific opcodes derived from TII, grouped | + // | | by instruction semantics. | + // | 2. Common Operands| All common operand types, except register operands, | + // | | defined by MachineOperand::MachineOperandType enum. | + // | 3. Physical | Register classes defined by the target, specialized | + // | Reg classes | by physical registers. | + // | 4. Virtual | Register classes defined by the target, specialized | + // | Reg classes | by virtual and physical registers. | + // +-------------------+-----------------------------------------------------+ + + /// Layout information for the MIR vocabulary. Defines the starting index + /// and size of each section in the vocabulary. struct { size_t OpcodeBase = 0; - size_t OperandBase = 0; + size_t CommonOperandBase = 0; + size_t PhyRegBase = 0; + size_t VirtRegBase = 0; size_t TotalEntries = 0; } Layout; - enum class Section : unsigned { Opcodes = 0, MaxSections }; + enum class Section : unsigned { + Opcodes = 0, + CommonOperands = 1, + PhyRegisters = 2, + VirtRegisters = 3, + MaxSections + }; ir2vec::VocabStorage Storage; - mutable std::set UniqueBaseOpcodeNames; + std::set UniqueBaseOpcodeNames; + SmallVector RegisterOperandNames; + + // Some instructions have optional register operands that may be NoRegister. + // We return a zero vector in such cases. + Embedding ZeroEmbedding; + + // We have specialized MO_Register handling in the Register operand section, + // so we don't include it here. Also, no MO_DbgInstrRef for now. 
+ static constexpr StringLiteral CommonOperandNames[] = { + "Immediate", "CImmediate", "FPImmediate", "MBB", + "FrameIndex", "ConstantPoolIndex", "TargetIndex", "JumpTableIndex", + "ExternalSymbol", "GlobalAddress", "BlockAddress", "RegisterMask", + "RegisterLiveOut", "Metadata", "MCSymbol", "CFIIndex", + "IntrinsicID", "Predicate", "ShuffleMask"}; + static_assert(std::size(CommonOperandNames) == MachineOperand::MO_Last - 1 && + "Common operand names size changed, update accordingly"); + const TargetInstrInfo &TII; - void generateStorage(const VocabMap &OpcodeMap); + const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; + + void generateStorage(const VocabMap &OpcodeMap, + const VocabMap &CommonOperandMap, + const VocabMap &PhyRegMap, const VocabMap &VirtRegMap); void buildCanonicalOpcodeMapping(); + void buildRegisterOperandMapping(); /// Get canonical index for a machine opcode unsigned getCanonicalOpcodeIndex(unsigned Opcode) const; + /// Get index for a common (non-register) machine operand + unsigned + getCommonOperandIndex(MachineOperand::MachineOperandType OperandType) const; + + /// Get index for a register machine operand + unsigned getRegisterOperandIndex(Register Reg) const; + + // Accessors for operand types + const Embedding & + operator[](MachineOperand::MachineOperandType OperandType) const { + unsigned LocalIndex = getCommonOperandIndex(OperandType); + return Storage[static_cast(Section::CommonOperands)][LocalIndex]; + } + + const Embedding &operator[](Register Reg) const { + // Reg is sometimes NoRegister (0) for optional operands. We return a zero + // vector in this case. + if (!Reg.isValid()) + return ZeroEmbedding; + // TODO: Implement proper stack slot handling for MIR2Vec embeddings. + // Stack slots represent frame indices and should have their own + // embedding strategy rather than defaulting to register class 0. + // Consider: 1) Separate vocabulary section for stack slots + // 2) Stack slot size/alignment based embeddings + // 3) Frame index based categorization + if (Reg.isStack()) + return ZeroEmbedding; + + unsigned LocalIndex = getRegisterOperandIndex(Reg); + auto SectionID = + Reg.isPhysical() ? Section::PhyRegisters : Section::VirtRegisters; + return Storage[static_cast(SectionID)][LocalIndex]; + } + public: /// Static method for extracting base opcode names (public for testing) static std::string extractBaseOpcodeName(StringRef InstrName); - /// Get canonical index for base name (public for testing) + /// Get indices from opcode or operand names. These are public for testing. + /// String based lookups are inefficient and should be avoided in general. 
unsigned getCanonicalIndexForBaseName(StringRef BaseName) const; + unsigned getCanonicalIndexForOperandName(StringRef OperandName) const; + unsigned getCanonicalIndexForRegisterClass(StringRef RegName, + bool IsPhysical = true) const; /// Get the string key for a vocabulary entry at the given position std::string getStringKey(unsigned Pos) const; @@ -111,6 +196,14 @@ class MIRVocabulary { return Storage[static_cast(Section::Opcodes)][LocalIndex]; } + const Embedding &operator[](MachineOperand Operand) const { + auto OperandType = Operand.getType(); + if (OperandType == MachineOperand::MO_Register) + return operator[](Operand.getReg()); + else + return operator[](OperandType); + } + // Iterator access using const_iterator = ir2vec::VocabStorage::const_iterator; const_iterator begin() const { return Storage.begin(); } @@ -120,18 +213,25 @@ class MIRVocabulary { MIRVocabulary() = delete; /// Factory method to create MIRVocabulary from vocabulary map - static Expected create(VocabMap &&Entries, - const TargetInstrInfo &TII); + static Expected + create(VocabMap &&OpcMap, VocabMap &&CommonOperandsMap, VocabMap &&PhyRegMap, + VocabMap &&VirtRegMap, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI); /// Create a dummy vocabulary for testing purposes. static Expected - createDummyVocabForTest(const TargetInstrInfo &TII, unsigned Dim = 1); + createDummyVocabForTest(const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, unsigned Dim = 1); /// Total number of entries in the vocabulary size_t getCanonicalSize() const { return Storage.size(); } private: - MIRVocabulary(VocabMap &&Entries, const TargetInstrInfo &TII); + MIRVocabulary(VocabMap &&OpcMap, VocabMap &&CommonOperandsMap, + VocabMap &&PhyRegMap, VocabMap &&VirtRegMap, + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI); }; /// Base class for MIR embedders @@ -144,11 +244,13 @@ class MIREmbedder { const unsigned Dimension; /// Weight for opcode embeddings - const float OpcWeight; + const float OpcWeight, CommonOperandWeight, RegOperandWeight; MIREmbedder(const MachineFunction &MF, const MIRVocabulary &Vocab) : MF(MF), Vocab(Vocab), Dimension(Vocab.getDimension()), - OpcWeight(mir2vec::OpcWeight) {} + OpcWeight(mir2vec::OpcWeight), + CommonOperandWeight(mir2vec::CommonOperandWeight), + RegOperandWeight(mir2vec::RegOperandWeight) {} /// Function to compute embeddings. 
Embedding computeEmbeddings() const; @@ -208,11 +310,11 @@ class SymbolicMIREmbedder : public MIREmbedder { class MIR2VecVocabLegacyAnalysis : public ImmutablePass { using VocabVector = std::vector; using VocabMap = std::map; - VocabMap StrVocabMap; - VocabVector Vocab; + std::optional Vocab; StringRef getPassName() const override; - Error readVocabulary(); + Error readVocabulary(VocabMap &OpcVocab, VocabMap &CommonOperandVocab, + VocabMap &PhyRegVocabMap, VocabMap &VirtRegVocabMap); protected: void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -275,4 +377,4 @@ MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS); } // namespace llvm -#endif // LLVM_CODEGEN_MIR2VEC_H \ No newline at end of file +#endif // LLVM_CODEGEN_MIR2VEC_H diff --git a/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json b/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json index 0afe5c77d0461..f026b0d183a01 100644 --- a/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json +++ b/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json @@ -1,5 +1,5 @@ { - "entities" : { + "Opcodes" : { "ABS_Fp":[0.07323841750621796, -0.006006906274706125, 0.09751169383525848, -0.011089739389717579, 0.06642112135887146, -0.015824640169739723, -0.021592319011688232, -0.0035401300992816687, 0.06047678738832474, -0.007392085622996092, 0.07134906202554703, -0.019624482840299606, -0.10975595563650131, -0.007685789838433266, 0.07451746612787247, 0.06384266912937164, -0.08230067789554596, 0.050922468304634094, 0.013724055141210556, 0.015687907114624977, -0.018451329320669174, 0.046987198293209076, -0.037734340876340866, -0.07235030829906464, 0.10218106210231781, 0.08037368208169937, -0.029537858441472054, -0.047520823776721954, -0.022125739604234695, -0.03125226870179176, -0.02882847562432289, 0.013811410404741764, 0.0023568253964185715, 0.017958490177989006, -0.05359291657805443, -0.03606243059039116, 0.07840022444725037, -0.016711654141545296, -0.038644544780254364, 0.05886651948094368, -0.011418955400586128, -0.04882095381617546, 0.04027162492275238, 0.001088760793209076, 0.03045983798801899, -0.10998888313770294, -0.0097441291436553, 0.015445191413164139, 0.030951637774705887, -0.06309321522712708, -0.019475746899843216, -0.029662512242794037, 0.05312168970704079, 0.05355998873710632, 0.05060160160064697, -0.053278811275959015, -0.01803833432495594, 0.010853713378310204, -0.053911495953798294, 0.06630647927522659, -0.08671313524246216, 0.0699775293469429, -0.08346731215715408, -0.045348167419433594, 0.06779918074607849, 0.008865933865308762, 0.05460203066468239, 0.007126103155314922, 0.0012282058596611023, 0.06817980855703354, 0.0216530654579401, 0.03552381321787834, 0.015414077788591385, -0.06002715229988098, 0.05233345925807953, 0.0782286673784256, 0.04220856353640556, -0.005762201733887196, 0.004772072657942772, 0.004578332882374525, 0.002619141712784767, 0.024511393159627914, -0.10089710354804993, 0.018322769552469254, 0.020811809226870537, -0.03358744457364082, -0.06896928697824478, -0.007399350870400667, -0.044467780739068985, -0.08094192296266556, -0.09795571863651276, 0.08391229063272476, -0.04749457910656929, 0.0029586481396108866, -5.354872337193228e-05, 0.005788655485957861, 0.015252145007252693, 0.06928747892379761, 0.041780371218919754, 0.016391364857554436], "ADC":[-0.07533542811870575, -0.01729339174926281, 0.04298720881342888, 0.015697332099080086, -0.04403507336974144, -0.059322185814380646, -0.050977922976017, 0.027526788413524628, -0.07009710371494293, -0.025621667504310608, 
0.0352291613817215, -0.011538374237716198, 0.03682859241962433, -0.09788215160369873, -0.07216927409172058, -0.03659192472696304, 0.05676230415701866, -0.06369645893573761, -0.04756825789809227, 0.005865555722266436, 0.022270306944847107, -0.042112063616514206, 0.07008901983499527, 0.07748222351074219, -0.1020870953798294, -0.008511601015925407, -0.05725255608558655, -0.07881367206573486, 0.05627593398094177, -0.0005361076910048723, 0.03351512551307678, 0.04348289221525192, -0.08322969079017639, -0.02161242999136448, -0.07805898040533066, 0.04819482937455177, -0.061123576015233994, -0.010114834643900394, -0.04676959663629532, -0.008176938630640507, 0.010575453750789165, -0.04312445595860481, 0.00376943894661963, -0.0691257119178772, 0.03553615137934685, 0.10397598147392273, 0.009375158697366714, 0.001147320494055748, 0.026351911947131157, -0.0194610096514225, -0.05202522128820419, 0.014047946780920029, -0.040036872029304504, 0.06963572651147842, 0.04827437922358513, -0.06908547878265381, 0.024857567623257637, -0.03304143249988556, 0.02291242778301239, 0.07687342166900635, -0.05110599845647812, -0.00873416755348444, 0.026205750182271004, 0.045064594596624374, -0.03565925359725952, 0.09580051153898239, -0.02518773265182972, 0.047807395458221436, -0.03548192232847214, 0.08286304026842117, -0.053511787205934525, 0.02892065793275833, -0.0495525486767292, 0.02590095065534115, -0.006982128601521254, 0.006042638327926397, -0.07269058376550674, 0.02401554025709629, -0.05660006031394005, -0.026029467582702637, 0.05318204686045647, 0.06714116781949997, -0.0023821850772947073, 0.05028798058629036, -0.005811943672597408, -0.003296421840786934, -0.005409242119640112, -0.10150349885225296, -0.06406981498003006, 0.02553202211856842, -0.002790689468383789, 0.0663856491446495, 0.09109167754650116, -0.04678672179579735, 0.022019781172275543, 0.007821275852620602, 0.022490357980132103, -0.058503177016973495, 0.08841150254011154, -0.00892670825123787], "ADD":[-0.037626221776008606, 0.006784931290894747, 0.10051396489143372, -0.0014993306249380112, -0.0323498398065567, -0.03148593008518219, -0.014100957661867142, -0.020252650603652, 0.014126972295343876, -0.1295478343963623, 0.08520576357841492, -0.02513248659670353, 0.03539956361055374, -0.07019674777984619, -0.019069846719503403, 0.016678515821695328, -0.009174983017146587, -0.019034702330827713, -0.024083402007818222, -0.07829779386520386, -0.007908892817795277, -0.07924024760723114, -0.034599609673023224, 0.05271153524518013, 0.0016642026603221893, -0.03938138112425804, 0.0019624519627541304, 0.03562740981578827, 0.07340876758098602, 0.09457183629274368, -0.06507840752601624, 0.00246993126347661, -0.004548616707324982, 0.058226197957992554, -0.021043049171566963, -0.0599520243704319, -0.03138553351163864, 0.03265950828790665, 0.004963710438460112, -0.003248866181820631, -0.04021746292710304, 0.038208190351724625, -0.02256007120013237, 0.10770396143198013, 0.013757425360381603, 0.040707558393478394, -0.00694271270185709, -0.012331271544098854, 0.004992029629647732, -0.032236646860837936, 0.01055158581584692, 0.04604483023285866, 0.09973260760307312, 0.07322807610034943, 0.06853726506233215, 0.004230210557579994, -0.04007832333445549, 0.16341225802898407, -0.01683313027024269, -0.01998194307088852, -0.035919081419706345, -0.055582448840141296, 0.008072910830378532, -0.0054771858267486095, -0.013343624770641327, 0.014230597764253616, -0.06542462855577469, 0.015897123143076897, -0.06011642515659332, 0.07983837276697159, 0.026512078940868378, 
0.014883842319250107, -0.015171286650002003, 4.1508101276122034e-05, -0.048078570514917374, -0.052594274282455444, -0.07897629588842392, -0.01334046758711338, -0.06180298700928688, 0.022423526272177696, 0.07393807917833328, 0.022332284599542618, 0.04279463365674019, 0.04075624793767929, 0.007524204906076193, -0.024405587464571, 0.0011822516098618507, -0.0019135301699861884, 0.10789427906274796, -0.040499038994312286, 0.011574117466807365, 0.048836030066013336, 0.0380941741168499, -0.047072283923625946, -0.01285380870103836, -0.038019485771656036, -0.06277137994766235, -0.0034404860343784094, -0.031123748049139977, 0.04279843345284462], @@ -47,7 +47,6 @@ "CVTSS":[-0.06638028472661972, -0.011326023377478123, 0.008208844810724258, 0.007368308026343584, 0.009791173972189426, -0.03396046161651611, 0.02250068075954914, -0.057750262320041656, -0.04949551820755005, 0.02559898979961872, -0.025012727826833725, -0.05923935025930405, 0.005058884620666504, 0.008716589771211147, -0.017511164769530296, -0.07095059007406235, -0.06573225557804108, -0.028140492737293243, 0.11092227697372437, 0.02664722129702568, -0.01997300609946251, 0.0798712745308876, -0.022800235077738762, 0.09157945215702057, 0.025709187611937523, -0.09037603437900543, -0.07092109322547913, -0.04094154015183449, -0.025702493265271187, 0.015247789211571217, 0.06089004501700401, 0.051023274660110474, -0.04670926183462143, 0.04763137549161911, -0.035940639674663544, 0.002320673782378435, -0.005764417815953493, -0.07975194603204727, -0.0038822791539132595, 0.06728507578372955, 0.020742014050483704, 0.08809743821620941, -0.061493389308452606, -0.0485445000231266, -0.022268671542406082, 0.08475345373153687, -0.0030403153505176306, -0.05737586319446564, -0.07930854707956314, -0.01657176949083805, 0.04658877104520798, 0.005716703832149506, -0.04288295656442642, -0.08686209470033646, -0.07359853386878967, 0.02947128191590309, -0.03684910386800766, -0.03841136023402214, 0.01288131158798933, -0.04918907582759857, -0.05579863488674164, 0.06267702579498291, -0.0034505922812968493, 0.034628838300704956, 0.04280426353216171, 0.042202845215797424, 0.012274117209017277, 0.025021208450198174, -0.07867497205734253, 0.03826712444424629, 0.017088277265429497, 0.037250861525535583, -0.016143174842000008, -0.06754780560731888, -0.013957766816020012, 0.1060054823756218, 0.014829001389443874, 0.06808885931968689, 0.022929415106773376, -0.10870063304901123, -0.002258410444483161, 0.009293666109442711, 0.08529872447252274, -0.018672339618206024, -0.06721168756484985, 0.04180533438920975, -0.0031767592299729586, -0.023869113996624947, -0.00011912015179404989, -0.034519728273153305, 0.0022619885858148336, -0.00573525857180357, -0.033912476152181625, 0.059763263911008835, -0.048703599721193314, -0.07433722168207169, 0.04105979949235916, 0.0022583131212741137, 0.03093089908361435, -0.05187990516424179], "CVTTSD":[-0.08537309616804123, 0.0010597433429211378, 0.07481679320335388, 0.05997887998819351, -0.0376993790268898, 0.10309506952762604, 0.07795511186122894, 0.0833413377404213, 0.056095756590366364, 0.05851535126566887, -0.057075001299381256, 0.020756129175424576, -0.08901876956224442, 0.02559811621904373, -0.016971183940768242, -0.04282280057668686, -0.005386374890804291, -0.06672719866037369, -0.09664622694253922, 0.06042492762207985, -0.042353514581918716, 0.06194235011935234, -0.025712836533784866, -0.029526079073548317, 0.044016264379024506, 0.036507125943899155, -0.038406822830438614, 0.006118632387369871, -0.0495009683072567, -0.07487531006336212, 
-0.07304015755653381, 0.042621925473213196, -0.06314127147197723, 0.03934277594089508, -0.09373295307159424, -0.05887934938073158, 0.010626542381942272, -0.050934500992298126, -0.037448156625032425, 0.01178495679050684, -0.07045318186283112, 0.10210251808166504, -0.07279546558856964, 0.04947654530405998, -0.039519909769296646, 0.07030976563692093, -0.011039734818041325, 0.01187387015670538, -0.0840335488319397, -0.005615191534161568, -0.06869980692863464, -0.012282256036996841, -0.013054385781288147, -0.0711965560913086, 0.015505223535001278, 0.0693473145365715, 0.012862266041338444, -0.04747828096151352, 0.023439936339855194, 0.03891129791736603, -0.04998844489455223, -0.04673001170158386, 0.02121424488723278, 0.0501207634806633, 0.07420068979263306, -0.014888633042573929, 0.007586659397929907, 0.01340668834745884, -0.09216003119945526, 0.09335170686244965, 0.023272672668099403, 0.030810026451945305, 0.05792044475674629, -0.020374637097120285, -0.02717672660946846, 0.028085753321647644, 0.08691198378801346, 0.061656054109334946, -0.07689087092876434, 0.0407567173242569, 0.010403914377093315, -0.03389676660299301, 0.07075867801904678, 0.002534526167437434, -0.026066122576594353, 0.005012217443436384, 0.08335569500923157, -0.02732011303305626, 0.03854125738143921, 0.03336648270487785, -0.10646265000104904, -0.003997548017650843, 0.09871185570955276, 0.0275016650557518, 0.015653448179364204, 0.07066125422716141, 0.05811227858066559, 0.046357106417417526, 0.047027964144945145, 0.07407277077436447], "CVTTSS":[-0.07762601226568222, 0.051891617476940155, 0.02840222790837288, 0.012996217235922813, -0.04709569737315178, -0.011790127493441105, 0.07787185907363892, 0.07411551475524902, 0.04010153189301491, 0.000911108567379415, -0.09610971063375473, 0.042953960597515106, 0.01613607630133629, -0.07504888623952866, -0.04967263713479042, 0.06148393824696541, -0.018901845440268517, 0.08033818751573563, -0.06893469393253326, -0.036083199083805084, 0.08206851035356522, 0.08462843298912048, 0.06728347390890121, -0.03210798278450966, -0.019102206453680992, 0.0723310112953186, 0.009836986660957336, -0.057902153581380844, 0.007954364642500877, -0.015247606672346592, 0.08317636698484421, -0.030078981071710587, -0.003329804167151451, -0.00047014118172228336, -0.02859017252922058, -0.07635723054409027, -0.008230162784457207, 0.03107159025967121, -0.009525406174361706, 0.06515175849199295, -0.06525594741106033, -0.028639627620577812, -0.0781184732913971, 0.009911812841892242, 0.011008340865373611, -0.04294031485915184, -0.04256690293550491, -1.394751961925067e-05, -0.029347950592637062, -0.031849224120378494, 0.012988862581551075, -0.0009693846222944558, -0.019299298524856567, 0.0045416890643537045, -0.04690401256084442, -0.04800841212272644, 0.0020325451623648405, -0.02004505693912506, 0.04130777344107628, -0.033602941781282425, -0.06956057250499725, -0.008079515770077705, 0.0033002288546413183, 0.03853915259242058, 0.08760882169008255, -0.04805464297533035, 0.02319355681538582, 0.018974801525473595, -0.08521144837141037, -0.05224936082959175, -0.023577861487865448, 0.01627342589199543, 0.024244949221611023, 0.09439363330602646, -0.007235093507915735, 0.055853620171546936, -0.00885567907243967, 0.02217228338122368, 0.05414341762661934, -0.0278383269906044, -0.000764147553127259, 0.045272815972566605, -0.009049531072378159, 0.05590446665883064, -0.05074811726808548, -0.06311893463134766, -0.026139337569475174, 0.01067473366856575, -0.043730076402425766, -0.07134802639484406, -0.11087869852781296, 
0.05522335693240166, -0.07894640415906906, -0.06710508465766907, -0.022497203201055527, 0.0777427926659584, -0.07944057136774063, 0.05494234338402748, -0.04788406938314438, -0.032921578735113144], - "ConstantPoolIndex":[0.041396364569664, -0.032536957412958145, -0.01450332161039114, -0.006678386591374874, 0.058945223689079285, 0.02544882893562317, -0.03047209233045578, -0.07739393413066864, -0.09328317642211914, -0.01668739691376686, -0.024649402126669884, -0.0379607230424881, -0.11910244077444077, -0.020992999896407127, -0.007654233835637569, -0.005232746247202158, -0.05641235038638115, -0.030478237196803093, -0.11095637828111649, -0.029757868498563766, 0.007831704802811146, -0.06478779017925262, -0.029330771416425705, -0.016729608178138733, 0.016851121559739113, -0.08636923134326935, 0.09819734841585159, -0.06862954050302505, -0.054081980139017105, -0.11573795974254608, 0.025045182555913925, -0.045820001512765884, -0.03937136381864548, -0.0006095073185861111, 0.010480350814759731, 0.04263518005609512, -0.07309181243181229, 0.030367357656359673, 0.05174611508846283, -0.07616177201271057, 0.08458246290683746, -0.05704038590192795, -0.08539492636919022, -0.027642514556646347, -0.01617196388542652, 0.025178344920277596, 0.009598441421985626, -0.02391812391579151, -0.007018273696303368, 0.08220435678958893, 0.019317878410220146, -0.07800780981779099, 0.008812256157398224, -0.08796992152929306, -0.018406951799988747, 0.06285018473863602, 0.0247958917170763, -0.010797450318932533, 0.042904313653707504, 0.04307369515299797, 0.03591239079833031, 0.0318138487637043, -0.052741825580596924, -0.05960077419877052, 0.05289359390735626, -0.07335714250802994, -0.07966916263103485, 0.06509458273649216, -0.014078558422625065, 0.05966315418481827, -0.10191051661968231, 0.038503143936395645, 0.08414285629987717, -0.09167703986167908, -0.03125883638858795, 0.00029595239902846515, -0.05052953213453293, 0.06109768897294998, 0.027757229283452034, 0.07064288854598999, 0.025423981249332428, 0.04430470988154411, 0.006646708585321903, 0.011614424176514149, -0.058028463274240494, -0.026873555034399033, -0.045714568346738815, -0.009242760017514229, -0.08255617320537567, 0.03060135245323181, -0.019932182505726814, -0.07189206779003143, 0.01935136877000332, 0.05297813192009926, 0.004497232846915722, -0.08383949100971222, -0.0008196682319976389, 0.03524069860577583, 0.023135961964726448, 0.00863903108984232], "DEC":[0.0634445771574974, -0.06605149805545807, 0.03212125599384308, 0.030006375163793564, -0.08837386220693588, -0.016591178253293037, -0.03157195448875427, 0.005282422062009573, 0.04301748052239418, -0.035375431180000305, -0.050481121987104416, -0.10733921080827713, -0.03802337497472763, -0.0745977833867073, -0.03943190351128578, -0.014895747415721416, 0.004689200781285763, -0.05872263386845589, -0.02043316885828972, 0.017881838604807854, -0.02151746302843094, 0.049130357801914215, -0.0980888232588768, -0.0012140831677243114, -0.03892286866903305, -0.050167523324489594, -0.06817777454853058, 0.011282221414148808, 0.0848090872168541, -0.04859968274831772, -0.005405630450695753, 0.09327276051044464, -0.031913015991449356, -0.07784294337034225, -0.039762917906045914, -0.0004000961489509791, -0.03763844072818756, -0.024915525689721107, 0.04509890824556351, 0.05546657368540764, -0.055939678102731705, -0.0467451736330986, -0.030023904517292976, -0.010519847273826599, 0.009574057534337044, 0.023444844409823418, 0.007250144146382809, 0.060414351522922516, -0.0011268716771155596, -0.10112253576517105, -0.068567655980587, 
-0.044332459568977356, 0.0022569731809198856, -0.012019195593893528, 0.0016708170296624303, 0.01029527559876442, -0.024694599211215973, -0.0428738109767437, 0.053816765546798706, 0.09999147802591324, 0.06608963757753372, 0.014324366115033627, 0.022997796535491943, -0.012565241195261478, -0.008212191984057426, -0.012308428063988686, -0.09830988198518753, -0.04177428036928177, 0.03759279474616051, 0.06749766319990158, -0.08330990374088287, -0.06375840306282043, 0.0471678152680397, 0.06524914503097534, 0.09668447077274323, 0.07395336031913757, -0.06081546097993851, 0.0322561152279377, -0.05461571738123894, 0.022349894046783447, 0.0981096625328064, 0.019211066886782646, 0.10566835105419159, 0.004508140496909618, 0.030159158632159233, 0.1076640635728836, -0.004145997576415539, 0.08043811470270157, 0.030684711411595345, 0.07909402251243591, -0.015952520072460175, 0.027102122083306313, 0.017120881006121635, 0.0860346332192421, 0.06145261228084564, -0.01827210932970047, 0.027506740763783455, 0.08201386034488678, -0.09402544051408768, -0.07927247136831284], "DIV":[0.08121486008167267, -0.06398852169513702, -0.007856910116970539, 0.09644383192062378, 0.0013691268395632505, 0.03523438796401024, -0.04342259466648102, -0.011761687695980072, 0.021194210276007652, -0.0386938601732254, -0.004948849324136972, -0.08348845690488815, 0.005121953319758177, -0.06682730466127396, -0.004115825518965721, 0.015023703686892986, 0.042783256620168686, 0.08872916549444199, -0.03392689675092697, -0.014770613051950932, 0.001988545060157776, -0.05145770683884621, -0.029310323297977448, 0.06324473023414612, -0.08066411316394806, 0.006997138261795044, 0.004352204035967588, -0.060964930802583694, 0.02948148362338543, 0.052747759968042374, -0.05635778605937958, -0.014655586332082748, 0.015838103368878365, -0.04539657384157181, 0.031915292143821716, 0.05234432592988014, -0.012030252255499363, 0.06431112438440323, -0.027869969606399536, -0.006431832443922758, 0.025956276804208755, 0.047651831060647964, -0.01758543774485588, 0.07249220460653305, -0.049627624452114105, -0.007435495033860207, 0.0015833197394385934, 2.190603845519945e-05, 0.03457536920905113, 0.03895196691155434, -0.037442032247781754, 0.003120564157143235, -0.0690622553229332, -0.04405339062213898, 0.016464274376630783, -0.05068953335285187, 0.009520933963358402, 0.05033525079488754, 0.030095860362052917, 0.08773164451122284, -0.03623930364847183, -0.0076989103108644485, 0.0133424773812294, 0.025229837745428085, 0.018198521807789803, 0.011319941841065884, -0.005582685582339764, -0.03598775342106819, -0.0565820187330246, 0.08609189838171005, 0.035601116716861725, -0.007436969317495823, -0.018040914088487625, -0.04825054481625557, -0.014956142753362656, 0.03343576192855835, -0.0739198625087738, 0.038971979171037674, -0.03691745549440384, -0.0371851809322834, 0.08137080073356628, 0.03924981504678726, -0.06499960273504257, 0.047913506627082825, -0.0464070662856102, 0.04404731094837189, -0.03972303494811058, 0.03341617435216904, 0.05367732420563698, -0.04457789286971092, -0.07455608248710632, 0.007865827530622482, 0.04562194645404816, -0.03552774339914322, -0.007738951593637466, 0.09388759732246399, -0.015701837837696075, 0.033921483904123306, -0.017276542261242867, 0.04943705350160599], "DIVPDrm":[0.04179735854268074, 0.008989601396024227, 0.0027430830523371696, 0.06804384291172028, -0.06657993793487549, 0.033647675067186356, -0.03707171231508255, 0.08443991839885712, -0.054565757513046265, 0.0765392854809761, -0.08189049363136292, 0.02573087066411972, 
0.018917549401521683, 0.079402856528759, -0.011117411777377129, 0.06308865547180176, -0.045432765036821365, -0.05054701492190361, -0.009618235751986504, -0.0594516322016716, 0.07967120409011841, 0.08030137419700623, -0.0768255814909935, -0.061036787927150726, 0.004279104992747307, -0.09737113863229752, 0.07295801490545273, -0.027599459514021873, 0.0045133912935853004, -0.048141367733478546, 0.0003157609316986054, -0.014835191890597343, 0.01462356187403202, -0.03225003555417061, 0.06723359227180481, 0.05244021862745285, 0.07099424302577972, -0.09206876158714294, 0.06154841184616089, -0.022400988265872, 0.034042902290821075, 0.002528816694393754, -0.04578591138124466, -0.023195132613182068, -0.07696253061294556, -0.03475971147418022, 0.03545870631933212, -0.021839862689375877, 0.0036371496971696615, 0.07372148334980011, -0.0596211701631546, -0.06768393516540527, -0.032637521624565125, 0.008432515896856785, 0.007569535635411739, -0.0034237385261803865, 0.05811845883727074, 0.013580343686044216, -0.03924565017223358, -0.025963587686419487, 0.03800642117857933, -0.04651957005262375, -0.033428385853767395, -0.053251899778842926, -0.04647624120116234, -0.034290049225091934, 0.003906013211235404, -0.05534028634428978, 0.04434804245829582, -0.08216925710439682, -0.011801591143012047, -0.006801240611821413, -0.07483590394258499, -0.06332433968782425, -0.005107037723064423, -0.008274846710264683, -0.07277056574821472, 0.03865613043308258, -0.0472225658595562, 0.009775533340871334, 0.055412523448467255, -0.014846398495137691, -0.008565607480704784, -0.018367087468504906, 0.038180120289325714, 0.06085506081581116, -0.02658388763666153, 0.006586031056940556, 0.0761575847864151, -0.007659312803298235, -0.10445686429738998, 0.01846102997660637, 0.02885548584163189, 0.0437043160200119, -0.012576445005834103, 0.04055696353316307, 0.002144219819456339, -0.08052077144384384, 0.03422001749277115, 0.03888843208551407], @@ -62,12 +61,9 @@ "DIV_Fp":[0.0013771128142252564, -0.03939857333898544, 0.06826473772525787, -0.055852942168712616, 0.021110225468873978, -0.07429434359073639, -0.01439732313156128, 0.047745198011398315, 0.03544871136546135, -0.006474921014159918, -0.05228240415453911, 0.00804696511477232, 0.0025021089240908623, 0.049810487776994705, -0.009595588780939579, 0.0507207065820694, -0.040155258029699326, 0.013851179741322994, -0.09630413353443146, -0.012529753148555756, 0.08176414668560028, 0.05994131416082382, 0.0013053410220891237, -0.035347871482372284, -0.06649265438318253, 0.07997933030128479, -0.042037565261125565, -0.06072461977601051, 0.09246786683797836, -0.0072363922372460365, 0.01850724034011364, 0.03905143961310387, -0.07601091265678406, -0.04824458062648773, -0.014410853385925293, -0.06455439329147339, -0.0593516007065773, -0.047922395169734955, -0.07904111593961716, -0.05896637961268425, -0.05629009008407593, -0.08674604445695877, 0.017179397866129875, -0.0020149857737123966, 0.02413070574402809, 0.024688012897968292, 0.027266085147857666, -0.015890855342149734, -0.00813567265868187, 0.024672919884324074, -0.020992467179894447, 0.019298823550343513, 0.022587062790989876, 0.06570186465978622, 0.061541132628917694, -0.07291612029075623, -0.010421186685562134, 0.032753147184848785, -0.06230449676513672, 0.040921296924352646, 0.05855383351445198, -0.035908423364162445, 0.05353318154811859, -0.013773049227893353, 0.0073576089926064014, 0.016397720202803612, 0.03753839433193207, 0.04765179380774498, -0.041083212941884995, -0.013994180597364902, -0.015261827036738396, 0.0982649177312851, 
0.05605688691139221, -0.041869863867759705, -0.017181048169732094, 0.03721241280436516, -0.005489564035087824, 0.026647603139281273, -0.07785916328430176, 0.0476430244743824, -0.006558667402714491, 0.06363014876842499, -0.05705825239419937, -0.048359137028455734, -0.09657922387123108, -0.020021332427859306, 0.05151694640517235, 0.0028305412270128727, -0.012787899002432823, -0.09800048917531967, 0.01322718895971775, 0.08181536942720413, -0.04321233555674553, -0.0016350646037608385, -0.03537006303668022, 0.041411954909563065, 0.028577959164977074, 0.01855066418647766, 0.01671769842505455, -0.04467424377799034], "DIV_FpI":[-0.010708109475672245, -0.0732470378279686, -0.033443547785282135, -0.06361733376979828, 0.017653197050094604, -0.030770231038331985, -0.0766882598400116, 0.08713997155427933, -0.0696694403886795, 0.0565333366394043, 0.0079630296677351, 0.009157304652035236, 0.07795052230358124, 0.00863052811473608, 0.009487103670835495, -0.021366223692893982, -0.08859013020992279, -0.052845098078250885, 0.07517100870609283, 0.030445149168372154, -0.031006425619125366, -0.011518558487296104, 0.031634584069252014, 0.006774903275072575, -0.0008412582101300359, 0.05720775946974754, -0.03664165362715721, 0.04671872407197952, -0.04702712222933769, 0.08346223086118698, -0.02042539417743683, 0.005731453187763691, -0.02509506233036518, 0.04370206221938133, -0.06398718804121017, 0.052075039595365524, 0.05920809134840965, 0.0037172543816268444, 0.07034561783075333, -0.018100138753652573, 0.002755390014499426, -0.07121799886226654, 0.03879084065556526, 0.013516174629330635, -0.02845778502523899, 0.019500035792589188, 0.014439111575484276, 0.06561631709337234, -0.10264755040407181, -0.016511712223291397, -0.018063146620988846, 0.08819841593503952, -0.0031949833501130342, -0.07884415239095688, -0.10739012062549591, 0.007700629997998476, 0.049550142139196396, 0.0866587832570076, 0.054501283913850784, 0.10046342760324478, 0.01546743419021368, -0.05334487929940224, -0.02652003802359104, -0.009483176283538342, 0.011785115115344524, 0.04965313896536827, -0.030048802495002747, -0.043639082461595535, -0.004809096921235323, -0.0515226274728775, 0.08381897956132889, 0.003956930246204138, 0.03591177612543106, 0.04015829414129257, -0.03484338894486427, 0.04027436673641205, 0.09792015701532364, 0.013287014327943325, 0.09490979462862015, -0.024792836979031563, 0.04872164502739906, 0.026059577241539955, -0.05917894095182419, -0.011415015906095505, -0.024944854900240898, 0.00499876169487834, -0.06721813976764679, -0.03442658111453056, -0.002175490139052272, 0.0004200722905807197, -0.10891042649745941, 0.021674668416380882, 0.03700282797217369, -0.0014478170778602362, -0.013477527536451817, -0.02742406167089939, -0.01233668439090252, 0.02371596358716488, 0.04435785114765167, -0.03723753243684769], "EH_LABEL":[-0.03541884943842888, 0.012697970494627953, 0.07690317928791046, 0.10800164937973022, -0.033531446009874344, 0.010248170234262943, 0.08690237253904343, -0.018254421651363373, 0.006330807693302631, 0.054908059537410736, 0.05281105265021324, 0.01866377331316471, -0.03986826166510582, 0.012461063452064991, -0.0570770688354969, 0.010465170256793499, -0.0007985670818015933, 0.014928294345736504, -0.08143509179353714, -0.04576095566153526, -0.014382844790816307, 0.09261113405227661, -0.06843073666095734, 0.08642790466547012, 0.010645134374499321, 0.02887858636677265, -0.08228367567062378, 0.06679805368185043, 0.0023300996981561184, 0.02060936950147152, 0.06778941303491592, 0.10305429995059967, 0.06289057433605194, 
-0.020899023860692978, -0.024045836180448532, 0.0433543361723423, -0.0338776558637619, -0.05156976729631424, -0.02495928853750229, -0.060252029448747635, -0.1022094339132309, 0.014480574987828732, 0.0545940026640892, 0.04824232682585716, 0.06658189743757248, 0.11414545774459839, 0.08304191380739212, -0.03313761577010155, -0.056730128824710846, -0.005165864247828722, -0.00412571569904685, -0.0007486011018045247, -0.03322497382760048, 0.0425163209438324, 0.0785580724477768, 0.015084332786500454, 0.049294766038656235, 0.07518152892589569, -0.008224403485655785, -0.08819448202848434, -0.020814890041947365, 0.054976895451545715, -0.06431052833795547, 0.026952102780342102, -0.02861913852393627, -0.05228573456406593, -0.08044329285621643, 0.02844928950071335, 0.06669115275144577, -0.005387885496020317, 0.05081101134419441, -0.0627083107829094, -0.0785573348402977, -0.042252350598573685, -0.0632990375161171, -0.042457811534404755, -0.07097408920526505, 0.0032806433737277985, 0.039354246109724045, 0.054314617067575455, 0.04231691733002663, 0.00793430395424366, -0.06007056310772896, -0.06129273772239685, -0.008646488189697266, -0.024291129782795906, -0.06316813081502914, 0.02861824445426464, 0.029990797862410545, -0.014714590273797512, 0.005561451427638531, 0.06847269088029861, 0.05630529299378395, -0.015434914268553257, 0.08646618574857712, -0.0025325058959424496, -0.0046173883602023125, -0.04263639822602272, -0.021261105313897133, 0.02382785640656948], - "ExternalSymbol":[0.014755810610949993, -0.049842361360788345, -0.06733497977256775, 0.05401315540075302, 0.061938412487506866, 0.02437831088900566, -0.06823863834142685, 0.03685877099633217, 0.02961423434317112, -0.04944299906492233, -0.1271103173494339, 0.030452819541096687, 0.019848955795168877, -0.03185190260410309, 0.06586895883083344, 0.0007315169204957783, 0.010839227586984634, -0.09547370672225952, -0.01799146644771099, -0.02204788289964199, 0.048699937760829926, 0.004187166225165129, 0.004053634125739336, -0.04464051127433777, -0.005158414598554373, -0.0416896678507328, -0.024279240518808365, -0.05358913540840149, -0.04719633609056473, -0.07180647552013397, 0.02559211477637291, 0.04657098650932312, 0.08353757858276367, -0.0023563469294458628, 0.046847302466630936, -0.03508693352341652, 0.0696689784526825, 0.054716791957616806, -0.012037037871778011, 0.019885245710611343, 0.01824580691754818, -0.06719563156366348, -0.05447190999984741, 0.08877509087324142, -0.01375679112970829, -0.014463561587035656, -0.049798283725976944, 0.06304343044757843, -0.007584648672491312, -0.016156170517206192, 0.024602508172392845, 0.004940119571983814, -0.04088609293103218, 0.0026271860115230083, 0.00787595845758915, -0.01889132149517536, -0.041029710322618484, 0.07343143969774246, -0.02505693957209587, -0.04825644940137863, 0.060728199779987335, 0.00460366066545248, 0.020744791254401207, 0.04238201677799225, -0.024090539664030075, -0.05792662873864174, 0.07639332860708237, -0.07511764764785767, -0.08259762078523636, 0.07901840656995773, -0.000285966758383438, 0.021390466019511223, -0.07818973809480667, -0.02385067008435726, -0.0014113716315478086, -0.055170729756355286, 0.00946732610464096, 0.02471417747437954, 0.07941421121358871, 0.006746167317032814, -0.06766024231910706, -0.089698426425457, 0.01933225803077221, -0.06994582712650299, -0.10149082541465759, 0.06007266044616699, -0.14545120298862457, -0.03447172790765762, 0.03258124738931656, 0.04966919496655464, 0.023691890761256218, -0.014501980505883694, 0.05896589905023575, 0.04760534316301346, 
-0.017742110416293144, 0.0019451226107776165, -0.01854461058974266, -0.04744676500558853, -0.017504630610346794, 0.05197983980178833], "FLDCW":[-0.0138143515214324, 0.021748993545770645, 7.070673746056855e-05, -0.0897645577788353, 0.09824047237634659, -0.07988506555557251, -0.03454058617353439, 0.0019847718067467213, 0.04983500763773918, 0.03934836760163307, -0.01007675752043724, -0.07798215001821518, -0.08095540851354599, 0.002752745756879449, 0.030696945264935493, 0.017224561423063278, 0.00200466881506145, 0.055515315383672714, -0.06178406998515129, -0.07683275640010834, 0.06503588706254959, 0.06047344580292702, 0.017141321673989296, -0.021984437480568886, -0.05550537258386612, -0.10371828079223633, 0.04531969875097275, 0.04299109801650047, 0.008607891388237476, -0.015554985031485558, -0.08462150394916534, 0.01943030022084713, -0.03486369550228119, -0.06457459926605225, -0.0051103211008012295, 0.05992105230689049, 0.0358397401869297, -0.04655934497714043, -0.018018357455730438, -0.057540085166692734, 0.0061888862401247025, -0.013676634058356285, -0.05362136662006378, 0.06076344475150108, 0.014500541612505913, 0.04466172680258751, 0.025775697082281113, 0.034106262028217316, -0.045596618205308914, 0.022729532793164253, 0.0068075573071837425, 0.033541467040777206, 0.04034329950809479, -0.05922241508960724, -0.11147011071443558, 0.10801365971565247, 0.028543133288621902, -0.076783187687397, 0.0018997815204784274, -0.030598029494285583, 0.04199691861867905, -0.09739390015602112, 0.06310229748487473, -0.03830089420080185, -0.03836864233016968, 0.02324736677110195, 0.10289694368839264, -0.08237223327159882, 0.09511970728635788, -0.022883199155330658, 0.07018155604600906, 0.021149639040231705, 0.06003378704190254, 0.020026177167892456, -0.019267164170742035, 0.06961971521377563, -0.004955677315592766, -0.07218261808156967, 0.08104820549488068, -0.0418921560049057, -0.0317075252532959, 0.020996741950511932, -0.009143776260316372, 0.05348548665642738, -0.0625229999423027, -0.06267517060041428, -0.09454416483640671, -0.043331023305654526, -0.06992270052433014, -0.027888890355825424, -0.08271876722574234, -0.05188243091106415, -0.010446823202073574, 0.05846165865659714, -0.010190286673605442, -0.03009830228984356, 0.03426814824342728, -0.03598400205373764, -0.1076725572347641, -0.028831692412495613], "FNSTCW":[-0.08537304401397705, 0.014420966617763042, 0.026950713247060776, -0.008387862704694271, -0.0038766334764659405, 0.026867343112826347, -0.030130255967378616, -0.04617878049612045, -0.007106459699571133, -0.0215947013348341, 0.007403566502034664, 0.032729458063840866, 0.0008728280663490295, -0.017559584230184555, 0.017324298620224, -0.014857987873256207, -0.03798896074295044, -0.05294371768832207, 0.05491216480731964, -0.04219334200024605, -0.024796022102236748, 0.033826109021902084, 0.04021430388092995, 0.015585671178996563, -0.025553781539201736, -0.011536196805536747, 0.021523986011743546, 0.01087264809757471, -0.023965656757354736, 0.021311553195118904, -0.0554395355284214, -9.890173532767221e-05, -0.0012819130206480622, -0.055725399404764175, 0.008443817496299744, 0.014645406976342201, 0.09493250399827957, 0.005851465743035078, -0.0346904918551445, -0.018780557438731194, -0.0024646760430186987, -0.04922417551279068, -0.025316428393125534, -0.047623440623283386, 0.04252983629703522, 0.008884137496352196, 0.024444259703159332, 0.11018849164247513, 0.06603030860424042, 0.10775407403707504, -0.06696148216724396, 0.07046543061733246, 0.03569186478853226, 0.06831049919128418, 
0.10069368779659271, -0.07917457073926926, 0.07819988578557968, 0.0325605608522892, 0.028253860771656036, -0.03586380183696747, 0.08094784617424011, -0.08532348275184631, 0.08135068416595459, 0.08752897381782532, 0.07736475020647049, 0.03881741315126419, 0.01930568367242813, 0.01373430248349905, 0.07003094255924225, 0.021482432261109352, 0.0606292188167572, 0.005889599211513996, -0.06958997994661331, 0.04857232794165611, 0.09418252855539322, 0.030624384060502052, -0.05853968486189842, 0.0978643149137497, 0.042890243232250214, -0.06594833731651306, -0.00445757107809186, 0.028062766417860985, 0.04270890727639198, 0.049651019275188446, -0.10246159136295319, -0.04101993143558502, -0.06874924898147583, -0.047776881605386734, 0.060615722090005875, 0.022016024217009544, -0.0476866140961647, -0.09320542216300964, -0.06186588481068611, 0.030679777264595032, -0.01664678566157818, -0.02508559450507164, -0.0495455376803875, 0.02986457757651806, 0.0242463406175375, -0.03076062723994255], - "FrameIndex":[0.05219179764389992, -0.01926516741514206, -0.021848104894161224, -0.008528115227818489, 0.02989117242395878, -0.012461756356060505, -0.050973404198884964, 0.026713935658335686, 0.01968700997531414, -0.001058116089552641, 0.009182002395391464, 0.03877940773963928, 0.070717453956604, -0.0028735792730003595, 0.0528000183403492, -0.015265910886228085, 0.007753959856927395, 0.01596899703145027, -0.07933179289102554, -0.02578687109053135, 0.02417992427945137, -0.03462255373597145, 0.04385964199900627, 0.004388607107102871, 0.03716951236128807, 0.04064105078577995, 0.07711678743362427, 0.0068300217390060425, -0.05443308874964714, -0.010809220373630524, -0.03124961629509926, 0.004911563824862242, -0.09201066941022873, 0.051436200737953186, 0.015400445088744164, 0.07804328948259354, -0.02971532940864563, -0.0003241244703531265, -0.02131350338459015, -0.09173687547445297, -0.01707594096660614, 0.0025449323002249002, 0.08701702952384949, 0.10675988346338272, -0.05082142353057861, 0.021581847220659256, -0.04104776680469513, 0.08402986079454422, -0.06109907105565071, 0.015201682224869728, 0.04374992102384567, -0.028573378920555115, -0.07767742872238159, 0.07216905802488327, 0.020538095384836197, -0.01229778677225113, 0.003033912740647793, -0.0007747758063487709, -0.09185474365949631, -0.02851664461195469, -0.009441743604838848, 0.05500328913331032, -0.002983751241117716, -0.09198789298534393, -0.051319632679224014, -0.054626885801553726, -0.020108554512262344, 0.0010591084137558937, -0.009138713590800762, 0.07223176956176758, -0.022099260240793228, 0.016025206074118614, -0.05320229008793831, 0.025131219998002052, 0.06626036763191223, 0.07639450579881668, -0.027084894478321075, 0.06581225991249084, -0.017618829384446144, -0.03859466314315796, -0.03385398909449577, 0.018783841282129288, -0.0730312392115593, 0.06957981735467911, -0.03065340407192707, 0.020685074850916862, -0.05311165004968643, 0.09466810524463654, 0.00955914705991745, -0.013919183053076267, -0.05540250986814499, -0.03087283857166767, -0.009688221849501133, 0.016239993274211884, -0.012926830910146236, -0.027712060138583183, -0.06342892348766327, -0.011996395885944366, 0.05536693334579468, -0.04359230771660805], "FsFLD":[-0.0508677139878273, -0.05399654433131218, -0.07149481028318405, -0.047971777617931366, 0.0019320917781442404, -0.007547610439360142, 0.0815814733505249, -0.12202084064483643, -0.08665104955434799, 0.03356856107711792, -0.15713559091091156, -0.0400867722928524, -0.006232412997633219, 0.044278621673583984, 0.09549921005964279, 
-0.029399411752820015, 0.01864752173423767, -0.04044967144727707, 0.05652021989226341, -0.09881851822137833, 0.025765251368284225, -0.02329906076192856, -0.06028103083372116, 0.09247462451457977, -0.04210466891527176, 0.03263019770383835, -0.03578515350818634, 0.0314578041434288, 0.003650028258562088, 0.04645871743559837, -0.010650137439370155, 0.015904754400253296, 0.018990037962794304, -0.005266033578664064, 0.038479309529066086, 0.008642041124403477, -0.049301791936159134, 0.09484748542308807, 0.005372038576751947, -0.08711376041173935, 0.07584445923566818, 0.09458201378583908, -0.00032702009775675833, 0.048093944787979126, -0.08043119311332703, 0.049779392778873444, -0.006967591121792793, -0.07319328933954239, 0.01582382619380951, -0.006244257558137178, -0.011940727941691875, -0.0013992231106385589, -0.028953444212675095, 0.010995968244969845, -0.005534093361347914, -0.04907146096229553, -0.0039899349212646484, 0.05501222237944603, 0.041574396193027496, 0.030038336291909218, -0.0402531623840332, 0.07675039023160934, 0.01103806123137474, -0.006072944961488247, -0.025336718186736107, 0.06967771798372269, -0.025075508281588554, 0.0031819106079638004, -0.015812508761882782, -0.12114851176738739, 0.07704214751720428, 0.1273191273212433, -0.014406625181436539, -0.031106390058994293, -0.0602225735783577, 0.016253838315606117, -0.059025105088949203, -0.04163780063390732, 0.01571997068822384, 0.025686416774988174, 0.032261066138744354, -0.016690189018845558, 0.014042876660823822, 0.009416786953806877, -0.012661219574511051, 0.013285082764923573, 0.03095356747508049, 0.008239349350333214, 0.0444798618555069, -0.05153216794133186, -0.010029821656644344, -0.015202880837023258, 0.06329496204853058, -0.0590473897755146, 0.08585292845964432, -0.08594027906656265, 0.06057215481996536, 0.01079416275024414, -0.04006461799144745, 0.029236430302262306], - "GlobalAddress":[0.021709734573960304, -0.03253590315580368, -0.04603651538491249, -0.02350226789712906, 0.02841794677078724, 0.01920732669532299, 0.053104616701602936, 0.03941836208105087, -0.01895466446876526, -0.030471740290522575, 0.010719750076532364, 0.020050356164574623, 0.03648754581809044, -0.021573888137936592, -0.02554452419281006, -3.637039117165841e-05, 0.05989491194486618, -0.006903402041643858, -0.08826262503862381, -0.028047384694218636, -0.04230065643787384, -0.05190899223089218, 0.06145390123128891, 0.0005839569494128227, -4.391977927298285e-05, -0.01880771853029728, 0.09660127758979797, 0.04333353415131569, 0.06461602449417114, -0.06010710820555687, -0.0690189078450203, 0.04574553668498993, -0.07640431076288223, 0.01879746839404106, 0.02076675370335579, 0.04869573190808296, 0.025147439911961555, 0.05311164632439613, 0.05711919441819191, 0.049520380795001984, 0.041169121861457825, -0.0603964701294899, -0.04195070639252663, 0.07676130533218384, -0.015161959454417229, 0.02903268299996853, -0.027548301964998245, 0.04705912992358208, -0.11194053292274475, -0.008245207369327545, -0.07792827486991882, -0.019468743354082108, 0.05482499673962593, -0.0028855702839791775, 0.05478052794933319, 0.07484771311283112, -0.011742575094103813, 0.00923923309892416, -0.05074375122785568, 0.06956734508275986, -0.045990440994501114, 0.007280972320586443, 0.040920473635196686, -0.09143709391355515, -0.06105270981788635, -0.0021254979074001312, -0.09519167989492416, 0.06324268877506256, -0.0693386048078537, -0.05100148543715477, 0.010643817484378815, -0.008162467740476131, -0.08811189234256744, -0.08640385419130325, 0.0077143507078289986, 0.030832089483737946, 
-0.01504515577107668, 0.07277517020702362, 0.02581198327243328, -0.052599068731069565, -0.06478387117385864, 0.01634707674384117, -0.021173706278204918, 0.030482977628707886, -0.09826494008302689, 0.07716016471385956, -0.10845024883747101, 0.04479274898767471, -0.015128640457987785, -0.03491876646876335, 0.05239150673151016, -0.03427724912762642, 0.06768845021724701, -0.04174086079001427, -0.05136744678020477, 0.0037109211552888155, -0.030324269086122513, -0.06928850710391998, -0.0395960658788681, 0.07726000994443893], "IDIV":[-0.03631015121936798, -0.07882149517536163, -0.010781447403132915, -0.025117948651313782, 0.01618420146405697, 0.044446997344493866, 0.011386583559215069, -0.00582836102694273, -0.012903614901006222, 0.006322081200778484, -0.07392880320549011, -0.1300479620695114, -0.05186808854341507, -0.06542935222387314, 0.08297666162252426, 0.03790606930851936, -0.07716395705938339, 0.02288512885570526, -0.038660015910863876, -0.04705967381596565, -0.00015759489906486124, -0.06133948266506195, -0.022438891232013702, -0.012017307803034782, 0.01929904706776142, 0.007114879786968231, 0.00567955756559968, -0.041199274361133575, 0.08304950594902039, 0.044402915984392166, -0.10634922981262207, -0.009510381147265434, 0.009772839024662971, -0.048219580203294754, -0.0321214459836483, 0.008684953674674034, 0.009846106171607971, 0.011280585080385208, 0.0310650784522295, 0.05677618831396103, 0.025418052449822426, -0.022629115730524063, 0.0074129728600382805, 0.1081111952662468, -0.03284893184900284, 0.002745774807408452, 0.05030296742916107, 0.04322626441717148, 0.005321172997355461, 0.03260405734181404, -0.051505692303180695, -0.033541131764650345, -0.03955534100532532, 0.047906432300806046, 0.02181984856724739, -0.0026405092794448137, 0.03350621834397316, -0.10710552334785461, -0.01533215120434761, -0.06872875243425369, -0.015413723886013031, -0.007149300072342157, -0.03660491481423378, -0.003503897227346897, -0.02898445539176464, 0.040071532130241394, 0.019684670493006706, -0.10101661086082458, -0.08199643343687057, 0.05637385696172714, -0.03792939707636833, 0.03106122836470604, -0.0590706542134285, -0.03607700765132904, -0.09597010910511017, -0.005815848242491484, 0.017992950975894928, 0.0007907312246970832, 0.04653536528348923, -0.03997295722365379, 0.006737773306667805, 0.11695551127195358, 0.022216010838747025, 0.041878726333379745, -0.035456813871860504, 0.04327021911740303, -0.03799387812614441, 0.10658515244722366, 0.010188632644712925, 0.09275273978710175, 0.09797771275043488, -0.12400814890861511, 0.03475511074066162, -0.08061601221561432, 0.022533612325787544, -0.11562027782201767, -0.026964085176587105, 0.08614259958267212, -0.025526022538542747, 0.040927182883024216], "ILD_Fp":[0.01509383600205183, -0.044326793402433395, -0.051242612302303314, -0.053859174251556396, -0.013097256422042847, -0.06370041519403458, 0.06120477616786957, 0.050328709185123444, -0.04184471070766449, 0.023432370275259018, -0.06435256451368332, 0.02055867575109005, 0.08239544183015823, 0.012251744978129864, -0.05063817650079727, 0.04293346777558327, -0.05919358506798744, -0.03159564360976219, -0.0037220751401036978, -0.001002405071631074, -0.026786377653479576, -0.07405146211385727, 0.044357798993587494, 0.08067265897989273, -0.05229390412569046, -0.06903751194477081, 0.010448710061609745, 0.006885232869535685, -0.052135784178972244, 0.08535145968198776, 0.041820794343948364, -0.020588336512446404, 0.07256042212247849, -0.017755955457687378, -0.032768987119197845, 0.06633710861206055, 
-0.03427698463201523, -0.10930930078029633, 0.05371936410665512, -0.06794329732656479, -0.014769122004508972, -0.07577606290578842, 0.07853815704584122, -0.09360899031162262, 0.05865737050771713, -0.034065186977386475, 0.05096115916967392, 0.0888199508190155, -0.03904300555586815, 0.03125728294253349, -0.0634637326002121, 0.03385297581553459, 0.027269205078482628, -0.07597903162240982, 0.008366324007511139, -0.03017764538526535, 0.011727942153811455, -0.04941355064511299, 0.027957690879702568, 0.09743025153875351, 0.004836047999560833, -0.028614182025194168, 0.016423141583800316, 0.0895770713686943, 0.025168858468532562, 0.030979957431554794, 0.016665387898683548, 0.025412173941731453, -0.035893514752388, -0.05403519794344902, 0.02931641787290573, 0.07742571830749512, -0.07045850157737732, -0.03433118015527725, -0.03651195392012596, -0.04036823660135269, -0.08663841336965561, 0.05561026185750961, 0.06927209347486496, -0.010819001123309135, -0.10697789490222931, 0.009881369769573212, 0.055065181106328964, -0.06379911303520203, 0.04137800633907318, 0.030417418107390404, -0.03515362739562988, -0.09139228612184525, 0.029920026659965515, 0.027388064190745354, -0.06739232689142227, 0.07639766484498978, -0.044223885983228683, 0.02472294308245182, -0.052025098353624344, 0.014643780887126923, 7.120784721337259e-05, 0.018760213628411293, -0.002873474732041359, 0.015561423264443874], "IMPLICIT_DEF":[-0.026583483442664146, -0.03995991870760918, 0.03633055090904236, -0.04622741788625717, -0.02326572686433792, 0.02231338992714882, -0.014788332395255566, -0.09906739741563797, 0.022785643115639687, -0.014632754027843475, -0.1041543111205101, 0.05013664439320564, -0.08690599352121353, -0.08063319325447083, 0.030247388407588005, -0.09707676619291306, 0.03499408811330795, 0.012669776566326618, 0.06481463462114334, -0.040453050285577774, -0.0489707849919796, -0.07584276050329208, 0.001047363504767418, 0.08496157824993134, 0.02357148937880993, -0.06866959482431412, 0.09267362207174301, 0.030527250841259956, -0.031355831772089005, 0.02419896423816681, -0.02442512847483158, 0.029297800734639168, 0.10321355611085892, 0.06579483300447464, -0.012722077779471874, 0.10042434185743332, -0.004708406049758196, 0.007217984646558762, 0.0753282904624939, -0.07088368386030197, -0.07383686304092407, 0.06410741060972214, 0.06312107294797897, 0.06989452987909317, 0.03766098991036415, -0.0008440924575552344, -0.023516006767749786, -0.04153933748602867, 0.07342316210269928, 0.05416297912597656, -0.02841850183904171, 0.04128013551235199, -0.001023625023663044, 0.005061942618340254, -0.06027042120695114, 0.025808431208133698, 0.027118714526295662, -0.08965771645307541, 0.012222534976899624, 0.008590211160480976, -0.01785023882985115, 0.03389652445912361, 0.0038459128700196743, 0.021088456735014915, -0.060241442173719406, 0.052924126386642456, -0.03849414363503456, 0.0044007860124111176, 0.05139085650444031, -0.06002991273999214, 0.026294095441699028, 0.06567239761352539, 0.1145782321691513, -0.02774081937968731, -0.07959162443876266, -0.00901349913328886, -0.09212079644203186, -0.016664501279592514, -0.019095804542303085, 0.05008011311292648, -0.016630882397294044, -0.007292845752090216, 0.01243519689887762, 0.011623953469097614, -0.0202464796602726, 0.08120717853307724, 0.04192841053009033, -0.014358888380229473, 0.0402902215719223, -0.05741799250245094, 0.0023748986423015594, -0.0007613254711031914, -0.11052780598402023, -0.08283583074808121, -0.018524790182709694, -0.09601832926273346, 0.037600427865982056, 
-0.06403559446334839, -0.08838459849357605, 0.01904650405049324], @@ -75,12 +71,10 @@ "INC":[-0.04204729199409485, -0.04558457434177399, -0.004308773670345545, 0.08560862392187119, -0.025844622403383255, -0.01385454647243023, -0.06715847551822662, 0.04059276729822159, 0.0008142509614117444, -0.04987747594714165, 0.05252164602279663, -0.07536070048809052, 0.012251293286681175, -0.01428443193435669, 0.028742481023073196, -0.024608345702290535, 0.009724774397909641, -0.024144234135746956, -0.04345421493053436, -0.03454094007611275, -0.03657921776175499, -0.025569358840584755, 0.04140102490782738, -0.02267373353242874, -0.05346262827515602, -0.07470668852329254, -0.03458420932292938, -0.015982985496520996, 0.013558092527091503, -0.029305797070264816, 0.026653757318854332, -0.00041234202217310667, 0.038508299738168716, 0.08509717136621475, 0.0016276738606393337, -0.013578594662249088, 0.05669381096959114, 0.0274334829300642, 0.023921431973576546, -0.02701006643474102, -0.09357035905122757, 0.07844959199428558, -0.03195708245038986, 0.044196177273988724, 0.017355425283312798, -0.04172753170132637, -0.07773707062005997, 0.018204662948846817, -0.07242465019226074, 0.07735569030046463, 0.03859752044081688, 0.08490721136331558, 0.04661087319254875, 0.015468046069145203, 0.02267235703766346, -0.030244702473282814, -0.043930262327194214, -0.015585970133543015, -0.004605699330568314, 0.0052457586862146854, -0.027553195133805275, -0.06406774371862411, 0.008009923622012138, -0.09624558687210083, 0.07006736844778061, 0.052846722304821014, -0.029392898082733154, -0.0659954622387886, -0.10725440829992294, 0.04428407922387123, 0.02606845460832119, 0.018936248496174812, -0.013534934259951115, 0.03338829427957535, -0.06049540638923645, 0.007389454171061516, 0.030835872516036034, -0.026952944695949554, -0.008518273010849953, 0.07688802480697632, 0.03663042560219765, -0.09961165487766266, -0.02765841968357563, 0.06263019144535065, -0.003026304766535759, -0.0023868512362241745, -0.052803706377744675, 0.04688272252678871, 0.08415349572896957, -0.044724639505147934, 0.01759890839457512, 0.022962408140301704, 0.00944716576486826, -0.084384024143219, -0.02845100499689579, -0.05094959959387779, -0.08001884073019028, 0.0449872724711895, -0.05161838233470917, 0.015422065742313862], "INLINEASM":[0.09296883642673492, -0.007579821161925793, 0.05054628103971481, 0.0011402746895328164, -0.02369365282356739, -0.040429845452308655, 0.048763860017061234, -0.012725423090159893, -0.017820369452238083, -0.0700153335928917, -0.00037883210461586714, 0.06301063299179077, 0.0503254272043705, 0.023893356323242188, -0.07308998703956604, 0.058056626468896866, -0.002504807896912098, -0.03528450429439545, -0.0775352418422699, -0.08423604816198349, 0.01841139607131481, 0.07128658145666122, 0.01363592129200697, 0.05391324311494827, 0.04803359508514404, 0.06145099550485611, -0.03153276443481445, 0.019207997247576714, 0.07138897478580475, 0.06972941011190414, 0.06482893973588943, -0.019937975332140923, -0.00694684125483036, 0.0624234639108181, 0.08495642989873886, 0.017590269446372986, -0.0075670769438147545, 0.05114367976784706, 0.031221428886055946, -0.07108655571937561, -0.018287384882569313, 0.035706836730241776, -0.0794610008597374, -0.03627452626824379, -0.06174106150865555, -0.036826081573963165, -0.030408767983317375, 0.008271732367575169, -0.09423738718032837, 0.004248321522027254, 6.044749170541763e-05, 0.011095447465777397, -0.10245273262262344, -0.07278212904930115, -0.00845671258866787, 0.008961541578173637, 0.019341865554451942, 
0.010205359198153019, 0.0724569708108902, -0.08050914853811264, -0.057010360062122345, 0.05053231865167618, -0.04844024032354355, 0.057458631694316864, 0.007486356887966394, -0.029497744515538216, 0.009812748059630394, -0.05314056575298309, 0.11012034863233566, -0.0647352784872055, 0.017479702830314636, -0.027027146890759468, -0.015448061749339104, 0.06321517378091812, -0.06948030740022659, 0.030430838465690613, -0.022251488640904427, -0.0358838327229023, 0.020705783739686012, -0.10970951616764069, -0.07724311202764511, 0.03224516287446022, 0.004828427918255329, 0.07738938182592392, -0.0036471053026616573, 0.06867322325706482, -0.07092054188251495, -0.024759342893958092, -0.054835252463817596, 0.019259851425886154, 0.011149682104587555, -0.09652992337942123, 0.050764426589012146, -0.0809553936123848, -0.04605351760983467, 0.0399462915956974, 0.05396333709359169, -0.01706104166805744, -0.031266387552022934, 0.020599452778697014], "IST_Fp":[-0.046584248542785645, -0.07452045381069183, 0.03998621925711632, 0.03091888502240181, 0.016272397711873055, -0.00985297653824091, -0.007199955638498068, -0.03536335751414299, 0.01673988439142704, 0.07562774419784546, 0.023876583203673363, -0.008683494292199612, 0.04009688273072243, -0.03663905709981918, -0.014492983929812908, 0.07349997758865356, 0.028999919071793556, -0.07499339431524277, -0.03727814927697182, -0.046455491334199905, -0.032447993755340576, 0.02374599315226078, -0.044662121683359146, -0.025333719328045845, 0.037562429904937744, 0.0006656686891801655, -0.00804421491920948, 0.06697870045900345, 0.04367857426404953, -0.0583018884062767, 0.03050180710852146, 0.053111929446458817, -0.04168881103396416, -0.027295507490634918, 0.057777389883995056, 0.08833678811788559, -0.026598922908306122, 0.005393106956034899, -0.05517015606164932, -0.0731138065457344, 0.07386088371276855, -0.07228095829486847, 0.023828018456697464, -0.0025013380218297243, -0.012031037360429764, 0.029700662940740585, -0.101964570581913, 0.0899822935461998, 0.013285316526889801, 0.002607472240924835, 0.04784732311964035, -0.044669900089502335, -0.04348702356219292, -0.07007527351379395, -0.016267215833067894, 0.059609103947877884, -0.036534957587718964, 0.013465121388435364, 0.10186120122671127, 0.015473871491849422, -0.08443709462881088, -0.004981503821909428, 0.06996916979551315, 0.011159068904817104, -0.07315052300691605, 0.024891534820199013, 0.0426689088344574, 0.008847315795719624, -0.06540054082870483, -0.09095568209886551, 0.053956128656864166, -0.010535894893109798, 0.035168495029211044, 0.04921877756714821, -0.07781729847192764, 0.006958760786801577, -0.05714801698923111, -0.06458019465208054, -0.055241748690605164, -0.007552466355264187, -0.02490214817225933, -0.014270482584834099, 0.03710750862956047, 0.003406278323382139, -0.044638775289058685, -0.09159127622842789, -0.025353819131851196, -0.07952282577753067, -0.02874225378036499, -0.06654132902622223, 0.0031955954618752003, 0.0602104589343071, -0.09261002391576767, -0.06175351142883301, 0.01194009743630886, -0.0348934531211853, 0.04460763558745384, -0.08773446083068848, 0.04335169121623039, 0.054603610187768936], - "Immediate":[-0.039664868265390396, 0.028720445930957794, -0.057207897305488586, 0.04179477319121361, 0.04477043077349663, 0.020050648599863052, -0.056656818836927414, -0.025030966848134995, -0.04394019395112991, 0.04849115386605263, 0.012325904332101345, 0.06731707602739334, 0.04568001255393028, -0.04773757979273796, -0.012142524123191833, -0.03986259177327156, -0.027249159291386604, 
-0.04930245876312256, -0.10542229562997818, -0.05678592994809151, -0.038303568959236145, -0.07283245027065277, 0.0217409897595644, -0.01139344647526741, 0.006936497986316681, -0.04702157527208328, 0.09977010637521744, -0.035237088799476624, 0.028822069987654686, -0.0691431537270546, -0.0829710066318512, -0.1289154589176178, -0.08470306545495987, -0.06731563061475754, 0.06642980873584747, 0.026025734841823578, -0.04049745202064514, 0.030080674216151237, 0.04203929752111435, 0.06834205985069275, 0.04315062239766121, 0.00788890291005373, 0.03426999971270561, 0.08819636702537537, 0.004112098831683397, 0.03392210975289345, 0.010541473515331745, 0.08045777678489685, -0.02914009988307953, 0.0624285452067852, 0.03299122676253319, -0.05355033650994301, -0.07568570226430893, 0.08106201142072678, 0.0376802459359169, -0.04886564612388611, -0.10992937535047531, -0.00761816743761301, -0.014918084256350994, 0.03816765174269676, -0.04981819912791252, 0.00031993765151128173, 0.011382698081433773, -0.029902901500463486, -0.0117422454059124, -0.057965945452451706, -0.09519924223423004, 0.020727403461933136, -0.04526710882782936, 0.09883677959442139, 0.018033087253570557, -0.003035350237041712, -0.06968960911035538, -0.09893210977315903, -0.01264366414397955, 0.017397744581103325, -0.08519260585308075, 0.09382850676774979, -0.055508699268102646, -0.026548130437731743, -0.013868317008018494, -0.03162496164441109, 0.06089535728096962, -0.01583624631166458, -0.060260944068431854, 0.06709896773099899, -0.09333796799182892, -0.02887417934834957, -0.03424007445573807, -0.01687423326075077, 0.11968979239463806, -0.08361987769603729, 0.09037765115499496, -0.04322688281536102, -0.040831610560417175, -0.061376459896564484, -0.03485504537820816, 0.016033072024583817, 0.004106835462152958, -0.03354674205183983], "Int_MemBarrier":[0.0418969988822937, -0.06285926699638367, -0.018717624247074127, -0.0031687396112829447, 0.04023218899965286, 0.08492552489042282, -0.06942103803157806, 0.005588027182966471, -0.08964942395687103, 0.055396437644958496, -0.06732998788356781, 0.06981600075960159, -0.05258888751268387, -0.06051918491721153, 0.02948639541864395, -0.04473342001438141, 0.01574157550930977, -0.04423875734210014, -0.053338322788476944, 0.008577392436563969, 0.10632415115833282, 0.040030092000961304, 0.02552260458469391, 0.026821544393897057, -0.05510386824607849, 0.05976655334234238, -0.0008300095796585083, 0.06861157715320587, -0.049591872841119766, -0.07650840282440186, -0.004643433261662722, -0.03990425914525986, 0.06366871297359467, -0.014906020835042, -0.06371121108531952, 0.0194997675716877, -0.07784571498632431, 0.029953552410006523, 0.06530797481536865, -0.09173597395420074, 0.021494632586836815, 0.052978403866291046, -0.001283245743252337, -0.05061378329992294, 0.04639996960759163, 0.06478390842676163, -0.015909312292933464, 0.013739313930273056, -0.06675873696804047, -0.0704226866364479, 0.020883914083242416, 0.07323179394006729, -0.0010066484101116657, -0.002373248105868697, -0.07056596130132675, 0.024577656760811806, 0.04880139231681824, -0.038608577102422714, 0.07695038616657257, 0.002806240925565362, 0.006876204162836075, 0.006961337756365538, 0.059363361448049545, 0.021191507577896118, -0.06366844475269318, -0.015020458959043026, -0.0815785601735115, 0.004222068004310131, -0.07691111415624619, 0.02711009606719017, 0.014720573090016842, 0.022912023589015007, 0.05272422730922699, 0.08111070841550827, -0.018083568662405014, -0.0418405644595623, 0.08496879786252975, -0.04420621693134308, 0.090696781873703, 
-0.02872851863503456, -0.024066468700766563, 0.07789512723684311, -0.012021118775010109, 0.041637614369392395, 0.07615016400814056, -0.042834896594285965, 0.05792360380291939, -0.051077719777822495, -0.05241186544299126, 0.006270663347095251, -0.008865885436534882, -0.09101007878780365, 0.009276151657104492, 0.036050815135240555, -0.06729964166879654, -0.014552133157849312, -0.06943532824516296, -0.023805340752005577, -0.058313168585300446, -0.04949163272976875], "JCC_":[-0.03625413775444031, -0.041811503469944, -0.07486920803785324, -0.05052778869867325, 0.021635157987475395, -0.045879144221544266, 0.014834613539278507, -0.03941917419433594, -0.010327291674911976, -0.08194752782583237, 0.049111511558294296, 0.05970187485218048, 0.03878019377589226, -0.08208157867193222, 0.11816514283418655, -0.0021148237865418196, 0.022616155445575714, 0.02145639806985855, -0.056387223303318024, -0.07890307158231735, 0.049655016511678696, -0.09555239230394363, -0.07599814981222153, 0.04143097624182701, -0.029399001970887184, 0.01379090640693903, 0.04894237220287323, 0.04915700852870941, 0.020924754440784454, 0.11983200162649155, -0.045743830502033234, 0.04826069250702858, 0.06473162770271301, 0.032176557928323746, 0.012342192232608795, 0.03632035106420517, -0.011231182143092155, 0.03319219872355461, 0.012383898720145226, 0.017726020887494087, -0.027707353234291077, 0.052987076342105865, -0.06459034234285355, 0.03180805966258049, 0.038370322436094284, -0.018640436232089996, -0.05121193453669548, -0.052741218358278275, 0.0953487753868103, 0.0914265364408493, 0.08409767597913742, -0.009599939920008183, 0.02045055478811264, 0.009363643825054169, -0.00872961338609457, -0.08178623765707016, -0.008178372867405415, -0.005903102457523346, 0.05836755037307739, 0.011602274142205715, -0.02761419117450714, 0.016957316547632217, 0.04471946507692337, 0.005247261840850115, -0.05416998639702797, 0.00770663283765316, -0.06152857095003128, 0.021657155826687813, -0.04485960677266121, -0.0008541923016309738, 0.053551655262708664, 0.062185727059841156, -0.012641278095543385, -0.020507624372839928, -0.02900690771639347, 0.019629495218396187, 0.05620177462697029, -0.07772354781627655, -0.025509009137749672, 0.01923682540655136, 0.03035508468747139, 0.018665296956896782, 0.013450516387820244, 0.06740278005599976, 0.013274379074573517, 0.011593983508646488, 0.02331095188856125, 0.048694003373384476, 0.05861792340874672, -0.021130137145519257, 0.02437412552535534, 0.059087324887514114, 0.024816056713461876, -0.050772879272699356, -0.01114521361887455, -0.028665395453572273, -0.09630053490400314, 0.0039062038995325565, -0.08236120641231537, 0.019473683089017868], "JMP":[-0.021766331046819687, -0.021576769649982452, -0.03795000538229942, 0.10449998080730438, -0.037742577493190765, -0.009156269021332264, 0.015289359726011753, 0.03519408404827118, -0.034353505820035934, 0.03226960077881813, 0.07340928167104721, 0.06086661294102669, 0.05736850947141647, -0.01725650765001774, -0.06702736765146255, -0.014972181059420109, 0.03435607627034187, 0.012023050338029861, 0.03370668366551399, -0.022338073700666428, -0.08280093967914581, -0.08060947060585022, 0.012210523709654808, -0.08165933936834335, 0.0016056479653343558, 0.015586943365633488, 0.11792927235364914, 0.06917431950569153, 0.02870137430727482, 0.01961304247379303, -0.027661900967359543, 0.10504695773124695, -0.03640349581837654, -0.01896090805530548, -0.011636625044047832, -0.04474593698978424, -0.029941411688923836, -0.058342345058918, 0.05885041877627373, 0.05553867667913437, 
0.03953809291124344, 0.06787443161010742, 0.002061075298115611, 0.027305128052830696, 0.05792280659079552, 0.08001891523599625, 0.026575665920972824, -0.0171738862991333, -0.010685772635042667, -0.05422135442495346, 0.03660969436168671, 0.03091355785727501, -0.05900857225060463, 0.08500046283006668, -0.08218419551849365, 0.061078935861587524, 0.018783383071422577, 0.047520000487565994, -0.00014930205361451954, 0.002577823819592595, -0.06816059350967407, 0.041743114590644836, 0.03372296690940857, 0.016127480193972588, -0.07235685735940933, 0.024466760456562042, -0.03468412905931473, 0.037008773535490036, -0.060657840222120285, 0.016427740454673767, 0.08229042589664459, -0.061172664165496826, -0.009794612415134907, -0.024358782917261124, -0.06573519110679626, 0.09360098838806152, -0.07428182661533356, -0.02529928646981716, 0.09198813885450363, 0.025180503726005554, 0.03200048953294754, 0.018081925809383392, 0.0034776402171701193, 0.07848992198705673, -0.00043209362775087357, -0.01768604852259159, -0.043686315417289734, 0.04550321400165558, 0.11878672987222672, -0.008190528489649296, 0.003286525374278426, 0.06845948845148087, 0.04892893135547638, -0.053277406841516495, -0.016919657588005066, 0.032096169888973236, 0.02839065156877041, -0.01713993400335312, -0.15167304873466492, -0.02013365738093853], "JMP_":[-0.014233234338462353, -0.0260892603546381, -0.13750334084033966, -0.050227466970682144, -0.042988359928131104, -0.027947310358285904, 0.08639533072710037, -0.16317786276340485, -0.03907149285078049, -0.05328908935189247, -0.03975899517536163, 0.04182944446802139, -0.010540750809013844, -0.11645861715078354, -0.012753792107105255, 0.002367585664615035, -0.05188040807843208, 0.0033823091071099043, -0.01240340806543827, -0.06099176034331322, -0.0015601427294313908, -0.11171454191207886, -0.04928319901227951, 0.05990544706583023, 0.015553089790046215, 0.04499414563179016, -0.034520961344242096, 0.07318194955587387, 0.013978325761854649, 0.07317976653575897, -0.029100794345140457, -0.09544635564088821, 0.030067358165979385, 0.057544808834791183, 0.005057932808995247, 0.005621553864330053, -0.03627946600317955, -0.0391962006688118, 0.03113878332078457, -0.02958066016435623, 0.012381716631352901, 0.011978821828961372, 0.13839371502399445, 0.010590317659080029, 0.06677765399217606, 0.046147286891937256, -0.05033441260457039, -0.020135121420025826, 0.032657306641340256, -0.05044032260775566, 0.05499301478266716, 0.07406507432460785, -0.0011679750168696046, 0.000989275984466076, 0.029161963611841202, 0.02679276280105114, 0.024040302261710167, 0.0710899606347084, -0.0035478041972965, -0.03730632737278938, -0.014350024051964283, 0.1638166308403015, -0.10163120925426483, 0.02900329977273941, -0.05366139113903046, 0.07186686992645264, -0.041340481489896774, 0.0401119627058506, -0.002295189071446657, -0.07949572801589966, -0.011504769325256348, 0.10675538331270218, -0.012056156061589718, -0.00748586468398571, -0.039624687284231186, -0.03555607795715332, -0.06799864768981934, -0.04550764709711075, -0.03302829712629318, -0.008404256775975227, 0.10563746094703674, -0.026095328852534294, 0.07613116502761841, 0.02101682499051094, 0.018749620765447617, 0.0056787943467497826, 0.005889789201319218, 0.03994893655180931, 0.05512934923171997, -0.004684743471443653, -0.01083239447325468, 0.0003112686099484563, 0.024348445236682892, -0.02665846049785614, 0.0064091463573277, -0.02719639055430889, 0.11076066642999649, -0.0014569570776075125, 0.0050220787525177, 0.032427236437797546], - 
"JumpTableIndex":[-0.007416237145662308, 0.0038157713133841753, 0.05180662125349045, 0.03776901960372925, -0.011749244295060635, -0.02952706068754196, -0.06646136939525604, 0.02088487148284912, -0.001927916775457561, 0.018895410001277924, 0.0509350448846817, 0.057210080325603485, -0.0476078987121582, -0.00016809302906040102, -0.02341553010046482, -0.06734820455312729, 0.02047930844128132, 0.009282611310482025, 0.0038133300840854645, 0.0020261742174625397, -0.09253961592912674, 0.0766557827591896, -0.049570225179195404, -0.11510220915079117, -0.009570423513650894, -0.007274465169757605, 0.07750000059604645, 0.02489926479756832, -0.08297400176525116, 0.048176445066928864, 0.03797437995672226, 0.060842450708150864, 0.020265065133571625, -0.03559373319149017, 0.03493893891572952, -0.0036544676404446363, 0.010211148299276829, -0.06471849977970123, -0.034595828503370285, -0.05245388671755791, -0.0014119939878582954, 0.008752748370170593, -0.020637203007936478, 0.053244929760694504, 0.052053239196538925, 0.014706660993397236, 0.02803724631667137, -0.07983336597681046, 0.03106858767569065, 0.001688914722763002, -0.07647732645273209, -0.028148295357823372, -0.0528123639523983, 0.08006428182125092, -0.06398879736661911, -0.033476538956165314, 0.05217607319355011, -0.03093232959508896, 0.044230975210666656, 0.05123162269592285, -0.05225585401058197, 0.06976816058158875, -0.0014492797199636698, 0.03833283483982086, 0.08385992050170898, -0.04722217097878456, -0.00226160092279315, -0.027254855260252953, -0.09566919505596161, 0.02109321765601635, -0.032354824244976044, 0.08032239973545074, -0.046937450766563416, -0.004326784983277321, -0.026024870574474335, 0.12039119750261307, 0.1016048863530159, 0.06808122247457504, -0.012297546491026878, -0.06450799852609634, 0.015778351575136185, 0.012280710972845554, 0.04002666845917702, 0.04792468994855881, -0.06248988211154938, -0.054222140461206436, 0.018379682675004005, -0.0029111658222973347, 0.016062958166003227, 0.09880068898200989, 0.03846307471394539, 0.04975416138768196, 0.07305088639259338, -0.020941948518157005, -0.020897891372442245, 0.03872328996658325, -0.05682756006717682, 0.09583723545074463, 0.0028475294820964336, -0.05127262324094772], "LCMPXCHG":[0.0649508610367775, -0.04321656376123428, 0.08405561745166779, -0.07786691188812256, -0.05277935788035393, 0.011031142435967922, -0.0015533932019025087, 0.08730415254831314, -0.004414519295096397, 0.04040057212114334, -0.005748671013861895, -0.013907546177506447, 0.1028006374835968, 0.09900037944316864, -0.06475479900836945, 0.024365412071347237, -0.0727076306939125, 0.06610138714313507, -0.026073187589645386, 0.08258920162916183, -0.007938066497445107, 0.07641425728797913, 0.10221290588378906, 0.029036179184913635, -0.024506229907274246, 0.00953623466193676, -0.03283938392996788, -0.07194274663925171, -0.023513879626989365, -0.017550935968756676, -0.037860531359910965, 0.042062658816576004, 0.0501263290643692, 0.02325640618801117, 0.0018605751683935523, 0.012687316164374352, -0.016979143023490906, -0.059858907014131546, -0.07078705728054047, 0.033630695194005966, 0.036799900233745575, -0.03821465000510216, -0.059619177132844925, -0.06309511512517929, 0.0019384543411433697, -0.053095221519470215, 0.00571654736995697, 0.07134073972702026, -0.02115899883210659, 0.021287376061081886, -0.04855392873287201, 0.0103003466501832, -0.008993818424642086, 0.05131004378199577, -0.0734843909740448, 0.017303360626101494, 0.008291462436318398, 0.046435531228780746, -0.055057018995285034, -0.05454597249627113, 
-0.009126733057200909, -0.0012434959644451737, -0.0846821740269661, -0.017736544832587242, -0.04779898375272751, 0.020568806678056717, -0.061118245124816895, -0.012131555937230587, 0.024907736107707024, -0.0161012914031744, -0.011221951805055141, -0.029136324301362038, 0.04336633160710335, -0.00514700124040246, 0.004810850135982037, 0.014044326730072498, -0.07381691038608551, -0.064864382147789, 0.041784100234508514, 0.06648915261030197, 0.038817185908555984, -0.03421948850154877, 0.019546108320355415, -0.00579161336645484, -0.06579872220754623, -0.01745537295937538, -0.07164284586906433, 0.032588109374046326, 0.009170422330498695, -0.08387100696563721, -0.04743993282318115, 0.05926872417330742, 0.03129392862319946, -0.012995549477636814, 0.007799868006259203, 0.036110181361436844, 0.01603531278669834, -0.09735894203186035, 0.014374110847711563, -0.023844046518206596], "LD_Fp":[0.09850919246673584, 0.022097617387771606, -0.02880568616092205, 0.014175659976899624, -0.03401500731706619, -0.010281442664563656, -0.05501694604754448, -0.041856300085783005, 0.07016798853874207, -0.022585496306419373, -0.007230871357023716, 0.02143889106810093, 0.011802875436842442, -0.011940510012209415, 0.001225354615598917, -0.04420488327741623, 0.058923713862895966, 0.07726655155420303, -0.024950502440333366, -0.005545462481677532, 0.037338823080062866, -0.03718772903084755, 0.08340831100940704, 0.030300375074148178, -0.04332158342003822, -0.10117480903863907, -0.023774733766913414, 0.055412717163562775, 0.07188894599676132, 0.048699796199798584, 0.02051064558327198, -0.05177381634712219, 0.046848755329847336, 0.06421937793493271, 0.014812597073614597, 0.06599052250385284, 0.055128950625658035, 0.057206105440855026, 0.004570540506392717, 0.0006673894240520895, -0.04956628009676933, 0.018173960968852043, 0.009045585989952087, -0.09929032623767853, -0.0734606683254242, 0.009978558868169785, 0.016378602012991905, -0.0809779167175293, 0.028371425345540047, 0.07337132841348648, -0.0712965577840805, -0.07612331956624985, 0.023224541917443275, -0.01886812597513199, 0.049867402762174606, 0.04525093734264374, -0.04347287490963936, 0.04647829011082649, -0.020921878516674042, 0.055911704897880554, 0.0646883100271225, 0.043256886303424835, 0.012135359458625317, 0.06405725330114365, 0.04327752813696861, -0.06879010051488876, -0.02182726003229618, -0.030435195192694664, -0.04794333875179291, 0.03966866061091423, -0.05612926930189133, 0.061092350631952286, -0.047390542924404144, 0.06440525501966476, 0.07119303941726685, 0.036672186106443405, 0.039346762001514435, 0.05825766921043396, -0.05363740026950836, 0.026515239849686623, -0.021117106080055237, -0.061990927904844284, 0.06407181918621063, -0.02918284200131893, 0.06280291080474854, 0.05465791001915932, 0.025043612346053123, -0.015093226917088032, 0.0339696891605854, 0.039516378194093704, -0.005943501368165016, 0.037065502256155014, 0.0036617075093090534, -0.04032375290989876, -0.027956390753388405, -0.028206538408994675, 0.003602939657866955, 0.0015424611046910286, 0.03779160603880882, -0.012583530507981777], "LEA":[-0.07203060388565063, -0.017553633078932762, 0.0402604416012764, -0.03958871215581894, -0.035693515092134476, 0.006020952947437763, 0.06661038845777512, -0.05565638095140457, -0.07512512803077698, 0.015386131592094898, 0.1531272977590561, 0.07126382738351822, -0.018143991008400917, 0.0798688530921936, -0.0836813896894455, -0.005903773941099644, -0.03920849785208702, 0.025672506541013718, -0.017640162259340286, -0.09243063628673553, 0.0272371768951416, 
0.04267166927456856, -0.032052017748355865, 0.06952647119760513, -0.03414658084511757, 0.05041181296110153, 0.04035321623086929, 0.04639449715614319, -0.000271787925157696, 0.1057962104678154, -0.031690120697021484, 0.0785541757941246, -0.008634688332676888, 0.035989925265312195, -0.00988205149769783, -0.047323428094387054, -0.018978994339704514, -0.001277003320865333, -0.022872451692819595, -0.034365635365247726, -0.04628191888332367, 0.06221615523099899, 0.01957613043487072, 0.13219280540943146, 0.03662179410457611, 0.046082716435194016, 0.011469600722193718, -0.025702660903334618, -0.08428508788347244, -0.07941769808530807, -0.06742636859416962, 0.0873873308300972, 0.0038614647928625345, 0.02177446149289608, -0.004519546404480934, -0.06213155761361122, -0.011228920891880989, -0.12034870684146881, 0.008946738205850124, 0.009164049290120602, -0.02258075587451458, 0.016061170026659966, 0.0645158663392067, 0.03723616153001785, -0.06451661139726639, -0.005219440441578627, -0.055180057883262634, 0.015841009095311165, -0.01621314138174057, -0.09887613356113434, 0.04894544556736946, -0.07996354252099991, 0.0138346366584301, -0.04036646708846092, -0.07073907554149628, -0.019294722005724907, -0.08181063830852509, -0.002301511587575078, -0.03429428115487099, 0.04098176211118698, 0.0706806555390358, 0.020024126395583153, 0.043529968708753586, 0.060017164796590805, 0.003525135340169072, -0.029752371832728386, -0.021769365295767784, 0.03941021487116814, -0.002250884659588337, -0.08078912645578384, 0.015297000296413898, 0.026888463646173477, 0.048139896243810654, -0.04837239161133766, -0.036249756813049316, -0.027615496888756752, -0.15165935456752777, -0.03756902739405632, 0.015112340450286865, -0.0010633820202201605], @@ -89,13 +83,10 @@ "LXADD":[-0.11344388872385025, 0.08068472892045975, -0.041796449571847916, -0.043138183653354645, -0.049067553132772446, -0.005337296053767204, 0.021436110138893127, -0.035862036049366, -0.05354782193899155, 0.007918866351246834, -0.033625587821006775, 0.048349399119615555, 0.07167208194732666, -0.04589017853140831, -0.023661522194743156, 0.03580676391720772, 0.03326055034995079, 0.041535746306180954, -0.008772681467235088, -0.03362834453582764, -0.008885134011507034, -0.005286931060254574, -0.09389151632785797, 0.015108847059309483, -0.020455803722143173, 0.06477829068899155, 0.012845957651734352, -0.03201524540781975, -0.07100234925746918, 0.046879976987838745, -0.06030888110399246, 0.022502053529024124, -0.10942362248897552, -0.06978410482406616, 0.0714743509888649, 0.057766277343034744, 0.038102924823760986, -0.007761931978166103, -0.11331900954246521, -0.07498679310083389, -0.002573479898273945, -0.005142265930771828, -0.04596858471632004, -0.05356051027774811, 0.10633396357297897, -0.07426618784666061, 0.037482988089323044, 0.10527358204126358, 0.08239476382732391, 0.0678592249751091, -0.014271541498601437, -0.010673552751541138, -0.0767236202955246, 0.0329856239259243, -0.02222914807498455, -0.0019944666419178247, -0.0789676085114479, 0.006855306681245565, -0.012843947857618332, -0.10197136551141739, -0.036981865763664246, 0.04500154033303261, 0.0023044694680720568, -0.0031417198479175568, -0.06536462903022766, -0.02773689292371273, 0.06672050058841705, 0.046953968703746796, 0.009028433822095394, -0.008872197940945625, 0.09054717421531677, 0.009121377021074295, 0.09400534629821777, 0.012045130133628845, -0.014854185283184052, 0.030989984050393105, -0.030203191563487053, 0.09275887161493301, -0.009853487834334373, 0.038435857743024826, 0.05689401552081108, 
-0.06919367611408234, -0.02360834926366806, -0.08338318765163422, 0.01904873177409172, -0.027271559461951256, -0.05529508367180824, 0.09507890790700912, -0.03128642588853836, 0.026687508448958397, -0.05117009952664375, -0.03872146084904671, 0.08641110360622406, -0.027542488649487495, -0.09849996864795685, 0.05740527808666229, -0.02291804924607277, -0.10829142481088638, 0.008436905220150948, 0.027438905090093613], "MAXSDrr":[-0.06119297072291374, -0.04124095290899277, -0.0296846404671669, -0.045824289321899414, 0.02508155070245266, 0.007925539277493954, 0.043926920741796494, -0.03159729018807411, 0.019068658351898193, -0.013711963780224323, -0.028986897319555283, -0.04561398923397064, 0.04851536825299263, -0.03764308616518974, -0.018207892775535583, 0.016173269599676132, -0.004123492166399956, -0.025343073531985283, -0.09777097404003143, 0.0290510356426239, -0.06969164311885834, -0.06684337556362152, 0.04377250373363495, 0.06861237436532974, -0.046966683119535446, 0.0611143596470356, -0.044503044337034225, 0.023559842258691788, -0.029876690357923508, 0.011016200296580791, 0.07286348938941956, 0.00030023325234651566, 0.08359035104513168, 0.017708808183670044, 0.07800529897212982, -0.08712167292833328, 0.002862636698409915, -0.06735634058713913, 0.03052128478884697, 0.04226242005825043, 0.023851098492741585, -0.04562359303236008, -0.013745550066232681, 0.013936172239482403, -0.0647776871919632, -0.0487772636115551, 0.07015536725521088, -0.030445875599980354, -0.043143901973962784, -0.09556057304143906, 0.047779254615306854, 0.046041958034038544, 0.009388554841279984, 0.04671555384993553, -0.059331271797418594, 0.03360891714692116, 0.03569460287690163, 0.004674405790865421, 0.03280949592590332, -0.011293579824268818, -0.05531742051243782, 0.045912306755781174, 0.04241438955068588, -0.07023770362138748, -0.03889290615916252, 0.019566599279642105, 0.06292827427387238, -0.012180106714367867, -0.009482266381382942, 0.0033363515976816416, -0.028241898864507675, 0.04916750639677048, -0.011430651880800724, 0.05025538429617882, 0.02134493552148342, 0.04370661824941635, 0.08801361173391342, -0.04115797579288483, -0.06421534717082977, -0.051845721900463104, -0.041304778307676315, 0.0507316067814827, 0.049301628023386, -0.013558737933635712, -0.004291698802262545, 0.038709867745637894, -0.0636303573846817, -0.047141704708337784, 0.022303685545921326, 0.07054309546947479, 0.009679436683654785, 0.0638614296913147, -0.046838339418172836, 0.01595005951821804, -0.025526082143187523, -0.0818924531340599, 0.016986405476927757, 0.023154381662607193, 0.06338698416948318, 0.07277237623929977], "MAXSSrr":[0.04370328411459923, 0.007435579318553209, 0.05632773041725159, 0.05872607231140137, -0.02179848775267601, -0.02491024136543274, -0.09028499573469162, -0.073136106133461, 0.0038046056870371103, -0.004702121019363403, 0.06376311928033829, 0.025374436751008034, 0.03343794494867325, -0.03841162100434303, 0.04050759971141815, 0.06359805166721344, -0.05459776520729065, -0.013898322358727455, 0.043059010058641434, 0.008913826197385788, -0.08469206839799881, 0.07041019201278687, -0.08591683208942413, 0.001833248999901116, 0.07940677553415298, -0.025694575160741806, -0.07197162508964539, -0.017312491312623024, -0.037606846541166306, -0.024861449375748634, 0.024707462638616562, -0.00026734761195257306, 0.033847302198410034, 0.05927937477827072, 0.04899705946445465, 0.0770091861486435, -0.09790053963661194, 0.057826053351163864, 0.05768071860074997, 0.01531772967427969, 0.0404951311647892, -0.04033346846699715, 
0.05936214700341225, -0.029121382161974907, 0.044257547706365585, -0.10413498431444168, 0.09214437007904053, 0.017709942534565926, 0.026122651994228363, -0.08045665174722672, -0.03744427487254143, 0.09111800789833069, 0.0020880592055618763, 0.07745599746704102, 0.04109589755535126, -0.07718705385923386, -0.045550283044576645, 0.06791391223669052, 0.06261736899614334, -0.04795467481017113, 0.016496436670422554, 0.02853775955736637, -0.038986679166555405, 0.012603304348886013, 0.05299075320363045, 0.0022748250048607588, 0.00884503684937954, 0.1081618219614029, 0.05347983166575432, 0.03069908171892166, 0.015294212847948074, 0.0618034303188324, -0.07555301487445831, -0.0897526815533638, -0.07293840497732162, -0.02863491326570511, 0.01548877265304327, 0.09115951508283615, 0.011775748804211617, -0.009436656720936298, -0.07188120484352112, -0.004493236541748047, 0.0661926344037056, -0.04905804619193077, -0.06685564666986465, 0.06110713630914688, 0.018521195277571678, -0.04577818885445595, 0.07256703823804855, 0.0831693485379219, 0.008730655536055565, 0.04827301949262619, 0.0754026547074318, 0.027548737823963165, -0.07210569083690643, -0.004550515208393335, -0.06998797506093979, -0.014580612070858479, -0.04511459916830063, 0.1119980439543724], - "MBB":[0.0285621527582407, 0.017540860921144485, -0.08473232388496399, -0.004012782592326403, 0.01284435298293829, -0.05268647149205208, 0.05576688051223755, 0.0021535248961299658, -0.03945871442556381, -0.006189210340380669, -0.015129411593079567, -0.08998296409845352, -0.023543253540992737, -0.03973307088017464, 0.03474939242005348, -0.01602775789797306, -0.07461361587047577, -0.016514597460627556, -0.016366377472877502, 0.004728052299469709, -0.023341577500104904, -0.0914730429649353, 0.030636735260486603, -0.03425632417201996, 0.03614623472094536, -0.007019295822829008, -0.0218521635979414, -0.015808485448360443, -0.05414801836013794, 0.029721688479185104, 0.09407073259353638, 0.029655681923031807, -0.005722714588046074, 0.08653672784566879, 0.01633341796696186, -0.07890991121530533, -0.07574641704559326, 0.013483843766152859, -0.0011275253491476178, -0.05623066797852516, -0.03096684440970421, -0.0019136210903525352, 0.005127475131303072, 0.005057196598500013, -0.008401975966989994, -0.0391613207757473, -0.0026145142037421465, 0.05342942103743553, 0.034099776297807693, 0.028928104788064957, -0.006105952430516481, -0.039190810173749924, 0.026784662157297134, -0.07679374516010284, -0.007475676946341991, -0.036650288850069046, 0.00774755235761404, 0.008984091691672802, -0.059830714017152786, 0.042310964316129684, 0.0681624785065651, -0.018189340829849243, -0.014816401526331902, -0.05541539564728737, -0.09348370134830475, 0.003691869555041194, -0.0010735570685938, -0.010131723247468472, -0.041050590574741364, -0.013792471028864384, -0.024337435141205788, 0.07526508718729019, 0.08163300901651382, -0.03508464992046356, -0.01681988686323166, -0.06734774261713028, -0.07656992971897125, -0.03866373747587204, 0.004544078838080168, 0.0585801787674427, -0.021823249757289886, -0.0610244981944561, -0.04469957575201988, -0.011089849285781384, -0.05069964751601219, -0.025694409385323524, -0.0670132040977478, 0.09616350382566452, 0.06308142840862274, -0.10543308407068253, 0.0023751568514853716, -0.06237253174185753, 0.05771911144256592, -0.06010056659579277, -0.016188565641641617, 0.009142348542809486, -0.014255198650062084, -0.02999819628894329, 0.00473234336823225, 0.03976761922240257], - "MCSymbol":[0.05158298835158348, 0.05024643987417221, 0.06704410910606384, 
0.0378347709774971, -0.03902719169855118, -0.08626251667737961, 0.03964311257004738, 0.06615762412548065, 0.04361319541931152, 0.03646374121308327, -0.018487416207790375, 0.0024993624538183212, 0.006693041883409023, 0.08311881870031357, 0.021111667156219482, 0.038208797574043274, 0.08689694851636887, -0.03659898787736893, 0.020775076001882553, 0.03553535416722298, 0.06854367256164551, -0.002012243028730154, 0.03658154606819153, 0.03127564862370491, 0.0363621786236763, -0.027205800637602806, -0.05243372917175293, 0.012564878910779953, -0.013430594466626644, -0.04043225944042206, -0.025083716958761215, 0.09665156900882721, 0.005077417939901352, -0.05181048810482025, 0.08925056457519531, 0.0777667909860611, -0.013708796352148056, 0.07754126191139221, 0.08393577486276627, 0.06395212560892105, -0.07428556680679321, -0.052424050867557526, 0.03497577831149101, 0.01964585855603218, -0.0429445318877697, 0.07072066515684128, 0.0017074055504053831, 0.059513408690690994, 0.013262910768389702, -0.07240563631057739, 0.09288764744997025, 0.030620144680142403, -0.046197980642318726, 0.04847298562526703, -0.03942957893013954, -0.0025783153250813484, -0.019526517018675804, 0.038867682218551636, 0.006007499527186155, -0.06366054713726044, 0.004640159662812948, 0.013837787322700024, -0.020015377551317215, -0.010317903012037277, 0.001741019543260336, 0.06261103600263596, -0.03374830260872841, 0.01629183441400528, -0.013137640431523323, 0.026046304032206535, -0.009679407812654972, -0.07085473090410233, 0.03035539574921131, -0.08764562010765076, -0.03820766881108284, -0.04181021824479103, -0.05163294076919556, 0.06666433811187744, -0.08939782530069351, 0.040260378271341324, -0.06847432255744934, 0.09106951206922531, -0.07388591021299362, -0.07479099184274673, -0.001779694459401071, -0.0963745042681694, -0.06515862792730331, -0.08404017239809036, -0.09935544431209564, 0.010541093535721302, -0.04491754248738289, 0.09378639608621597, 0.006655062548816204, 0.06637217849493027, -0.05623293295502663, -0.020134123042225838, 0.005873391404747963, -0.07765494287014008, -0.0008442706312052906, -0.03568055108189583], "MINSDrr":[0.00284420233219862, 0.07673676311969757, 0.08602232486009598, 0.030074521899223328, -0.06255929172039032, -0.10135219246149063, 0.0772649347782135, 0.0045582992024719715, -0.01195931900292635, 0.009085145778954029, -0.04665979743003845, 0.019213048741221428, 0.022454556077718735, -0.05505772680044174, 0.035268958657979965, -0.06431140005588531, -0.001450810581445694, -0.027346337214112282, 0.041191086173057556, -0.0808955729007721, -0.04748200997710228, 0.0653977245092392, 0.042980875819921494, -0.04332194849848747, -0.024661004543304443, 0.09317019581794739, -0.06639514118432999, 0.013383567333221436, 0.051771167665719986, 0.05815904587507248, -0.05226780101656914, 0.079694002866745, -0.017969269305467606, -0.07137028127908707, -0.0011493286583572626, -0.02009846828877926, 0.006549016106873751, 0.0019126685801893473, 0.06168307736515999, -0.025323089212179184, 0.010943768545985222, 0.02157585136592388, -0.012993190437555313, -0.025179127231240273, -0.08958654850721359, -0.04273540899157524, 0.015248515643179417, 0.05456075817346573, 0.05705633386969566, -0.0038763433694839478, 0.08008016645908356, -0.004114328417927027, -0.01975642889738083, -0.014040309935808182, 0.025527596473693848, -0.06883629411458969, 0.06273050606250763, 0.05779215693473816, -0.061573851853609085, 0.01889919489622116, 0.026195447891950607, -0.021544434130191803, -0.0810774490237236, -0.016286203637719154, 
0.01799311302602291, -0.08440321683883667, 0.0897485539317131, 0.08083964139223099, -0.006629236973822117, 0.051063962280750275, -0.08597207814455032, 0.029692046344280243, -0.03309508413076401, -0.09422174096107483, 0.0019163102842867374, 0.05546015128493309, -0.05980079993605614, -0.07416199892759323, -0.005134278908371925, 0.07392455637454987, -0.0634748563170433, 0.020546387881040573, -0.019978882744908333, 0.039572179317474365, -0.04754075035452843, -0.06090293824672699, -0.011185224168002605, -0.054661743342876434, 0.027916360646486282, -0.00819246843457222, -0.03119322657585144, 0.019949961453676224, 0.008312772959470749, 0.06788603216409683, 0.041624777019023895, 0.051687415689229965, -0.04819793254137039, -0.0761520192027092, -0.019374510273337364, -0.008435340598225594], "MINSSrr":[-0.06906168162822723, 0.008121289312839508, 0.010413543321192265, 0.052863992750644684, 0.01030051801353693, -0.009280139580368996, 0.016139337792992592, -0.05126945674419403, 0.06733083724975586, -0.01006366591900587, 0.06506948918104172, 0.05012301355600357, -0.07191506028175354, 0.018038516864180565, -0.020798280835151672, 0.08538958430290222, -0.028427604585886, 0.02630189247429371, 0.010489841923117638, 0.10011959075927734, -0.067482590675354, 0.01461686473339796, 0.03908747434616089, -0.015383233316242695, -0.03783239424228668, 0.06359098851680756, -0.052475571632385254, 0.07818790525197983, -0.0030931381043046713, 0.013684416189789772, 0.04222726821899414, 0.04708671569824219, 0.01192860770970583, 0.08628913760185242, -0.06380248814821243, -0.004006511997431517, -0.02817981317639351, -0.11196613311767578, 0.01953534409403801, 0.0034300305414944887, -0.040240559726953506, 0.004963779356330633, -0.06623393297195435, -0.04386508837342262, -0.08431598544120789, -0.023293999955058098, 0.02133636176586151, 0.04054516181349754, 0.04479363188147545, 0.02776535600423813, 0.01497643906623125, 0.026148531585931778, -0.05869835242629051, -0.07451415807008743, -0.009552933275699615, -0.004124804865568876, 0.08342882245779037, 0.05295371264219284, -0.05495591461658478, -0.07350015640258789, -0.05573306977748871, 0.07158630341291428, 0.04162517935037613, 0.0019162269309163094, -0.07742705941200256, -0.05673951655626297, 0.05760834366083145, 0.08143799751996994, 0.09629082679748535, -0.05737840384244919, 0.03762679174542427, 0.022383252158761024, 0.02897579036653042, -0.0929567888379097, 0.04767351970076561, 0.05145186930894852, 0.012956425547599792, 0.04237693175673485, 0.06772835552692413, 0.011290902271866798, -0.06324069201946259, -0.04689439386129379, 0.09521757066249847, 0.05625065788626671, -0.032533977180719376, -0.00987032800912857, -0.08346299827098846, -0.06292857229709625, 0.042861636728048325, 0.08865208923816681, -0.0021774298511445522, 0.010668188333511353, -0.05791740491986275, 0.02240762859582901, -0.022414017468690872, 0.04343479871749878, 0.01852354407310486, -0.004329795949161053, -0.00262851663865149, -0.009029376320540905], "MOV":[-0.03924819082021713, -0.015029003843665123, 0.14121688902378082, -0.05414531007409096, -0.01409768033772707, 0.05467522144317627, -0.0798286497592926, 0.042834796011447906, -0.04328306391835213, -0.12638653814792633, 0.02380293421447277, -0.010002975352108479, -0.03018246777355671, -0.09843093156814575, -0.015159506350755692, -0.03186051547527313, -0.009830419905483723, 0.024049948900938034, -0.028536750003695488, -0.05252794921398163, -0.003984724637120962, -0.09075328707695007, -0.015937313437461853, 0.07316069304943085, 0.002778300317004323, 
0.003214895725250244, -0.0832214206457138, 0.012602301314473152, 0.0687694102525711, 0.1425037384033203, -0.04724106192588806, 0.05618143081665039, 0.0028424363117665052, 0.03067261539399624, 0.008477674797177315, -0.002142940880730748, 0.0036045191809535027, -0.02257452718913555, 0.013552851043641567, -0.016065331175923347, 0.03364546224474907, 0.0027604023925960064, -0.013575572520494461, 0.1340155154466629, 0.04859570413827896, 0.07984673976898193, 0.006813493091613054, -0.017625009641051292, -0.0667564794421196, -0.0025298972614109516, -0.06280945241451263, 0.08589767664670944, -0.011751428246498108, 0.04074618220329285, 0.0561428964138031, -0.0068444423377513885, 0.028041694313287735, 0.06258948892354965, 0.02493610419332981, -0.018480388447642326, -0.035079196095466614, 0.14365622401237488, -0.046609606593847275, 0.040164150297641754, -0.049927353858947754, 0.06781942397356033, -0.04828719049692154, 0.03496144339442253, -0.044686879962682724, 0.04254060238599777, 0.024320241063833237, -0.0031205937266349792, -0.049061503261327744, -0.028716804459691048, -0.056192029267549515, 0.022012677043676376, -0.0745186060667038, -0.0008951064082793891, -0.051033493131399155, 0.023357892408967018, 0.06984421610832214, 0.0057564410381019115, -0.005192344542592764, -0.003961252048611641, -0.012275456450879574, -0.018581852316856384, -0.0046620736829936504, 0.02494811825454235, 0.0520334355533123, -0.02435225434601307, 0.0008846594137139618, 0.017687007784843445, 0.07866063714027405, -0.025595100596547127, -0.020679078996181488, -0.027750879526138306, 0.10005537420511246, -0.015581297688186169, -0.08011393249034882, 0.028118811547756195], "MUL":[-0.026987887918949127, 0.06016572564840317, 0.0787728950381279, -0.0803905576467514, 0.005736608523875475, -0.07245960086584091, -0.02662983350455761, 0.012340782210230827, 0.042490337044000626, 0.06399581581354141, -0.009004191495478153, 0.0370473749935627, -0.0605553574860096, -0.09520823508501053, 0.0010566662531346083, -0.028270091861486435, 0.08631408214569092, 0.002891023177653551, -0.051674507558345795, -0.04089691862463951, -0.04444378614425659, -0.061945777386426926, -0.026001833379268646, 0.04689744487404823, -0.07711070775985718, 0.07018855959177017, -0.02606336772441864, 0.054914504289627075, 0.03522270917892456, -0.027317974716424942, 0.02187947928905487, 0.009710998274385929, 0.01340037677437067, 0.016422593966126442, -0.058249425143003464, -0.08377814292907715, -0.04476138949394226, 0.04349169507622719, 0.05062006786465645, 0.01706511154770851, 0.020649245008826256, 0.06287672370672226, -0.03981941193342209, 0.04973218962550163, -0.03353424742817879, -0.016799092292785645, -0.031751759350299835, 0.10430201143026352, -0.04326871410012245, 0.0736854076385498, -0.0768580436706543, -0.03183818608522415, 0.010583195835351944, 0.015541432425379753, 0.03191666305065155, 0.020011236891150475, 0.041239380836486816, -0.0029152908828109503, 0.009499716572463512, -0.011166329495608807, 0.03469998389482498, 0.00607832008972764, 0.030300112441182137, -0.040855471044778824, 0.00988304428756237, 0.050531189888715744, 0.06647889316082001, -0.027519647032022476, -0.06819992512464523, 0.02215251699090004, 0.086424820125103, -0.03395787626504898, -0.020825445652008057, 0.08309803158044815, -0.0256529338657856, 0.005000723991543055, -0.03375622257590294, 0.005569287110120058, -0.028089171275496483, 0.04142652079463005, -0.03232670575380325, 0.025872791185975075, -0.07439207285642624, 0.04975134879350662, 0.049770113080739975, -0.05090470612049103, 
-0.04476647078990936, 0.09217675030231476, 0.05079415813088417, 0.017867455258965492, -0.04477125406265259, 0.004301204811781645, 0.05066722631454468, -0.08186711370944977, 0.008772231638431549, -0.10532139241695404, 0.004499110858887434, 0.03296274691820145, -0.0020684772171080112, 0.05012065917253494], - "Metadata":[-0.07879140228033066, 0.024690961465239525, 0.022790303453803062, 0.01354144886136055, -0.07098772376775742, 0.04053819552063942, -0.04038544371724129, -0.021055836230516434, 0.10361373424530029, 0.04415135458111763, -0.09545262902975082, 0.042553599923849106, -0.021835647523403168, 0.07703430950641632, -0.04880501329898834, -0.04054124280810356, 0.05049756169319153, 0.08986796438694, 0.0705084353685379, -0.0077315340749919415, -0.045390889048576355, 0.053155045956373215, 0.045656319707632065, -0.02663712576031685, -0.01446426473557949, -0.058978915214538574, 0.011314704082906246, 0.03043927252292633, -0.0843580812215805, 0.017854437232017517, -0.08720997720956802, 0.030351335182785988, -0.04896129295229912, 0.04189978539943695, -0.09887325763702393, 0.0015409664483740926, -0.08604399859905243, 0.10654544085264206, 0.1058540865778923, 0.014106648042798042, 0.0640459656715393, -0.05182884633541107, 0.006081609521061182, 0.07624028623104095, 0.02025698497891426, 0.08467324078083038, 0.027136018499732018, 0.026320911943912506, -0.035337720066308975, 0.03864980861544609, -0.019960917532444, -0.029152821749448776, 0.06562864780426025, 0.028298277407884598, -0.07397148013114929, -0.005078969523310661, 0.025909438729286194, -0.01157586183398962, 0.05436081811785698, 0.03408071771264076, -0.07142144441604614, -0.0523630827665329, -0.06302442401647568, -0.019975490868091583, -0.06937523931264877, 0.057667043060064316, -0.08580337464809418, -0.05092239752411842, -0.012613813392817974, 0.025480754673480988, 0.04219530522823334, -0.007300581783056259, 0.05323299020528793, 0.0489904023706913, 0.09260626882314682, -0.04819458723068237, 0.05419271066784859, 0.04558999091386795, 0.012036344967782497, -0.05483977124094963, -0.05181310698390007, -0.02104383148252964, -0.057876624166965485, 0.039601441472768784, 0.025240536779165268, -0.03984035924077034, 0.07654847204685211, -0.07073183357715607, -0.0018080074805766344, -0.016453349962830544, 0.03962434455752373, 0.05717255175113678, 0.01962372660636902, 0.00952839944511652, 0.0013127806596457958, 0.013634574599564075, 0.07692103832960129, 0.06334574520587921, 0.056647684425115585, -0.02965259924530983], "NEG":[-0.0585959330201149, -0.02519698068499565, 0.029133861884474754, -0.003332944354042411, 0.05054186284542084, -0.03572014719247818, -0.012210451066493988, 0.06708117574453354, -0.0712793841958046, -0.01644597202539444, 0.06453811377286911, -0.03662518784403801, 0.0545802004635334, -0.11130833625793457, -0.04544609412550926, 0.012950814329087734, 0.08011337369680405, 0.014672964811325073, 0.0030391360633075237, -0.10994786024093628, 0.004102041013538837, -0.0749390497803688, -0.010000540874898434, 0.062072113156318665, 0.03312767669558525, -0.04764379560947418, -0.033307697623968124, 0.02903047949075699, 0.0319744311273098, 0.027374137192964554, -0.05640692263841629, -0.01572772115468979, 0.019634589552879333, 0.0629790723323822, -0.024743184447288513, -0.09348101913928986, 0.04078087955713272, 0.0002063393039861694, 0.01791796088218689, -0.01174850668758154, 0.0067609078250825405, 0.031922854483127594, 0.045338794589042664, -0.06706424057483673, -0.03090975433588028, 0.035511564463377, 0.0377444289624691, -0.007464382331818342, 
0.02387971244752407, -0.023001981899142265, -0.0052301278337836266, 0.08532170951366425, 0.00384823651984334, 0.0689602717757225, -0.05606595426797867, 0.03483026847243309, 0.023350417613983154, -0.06512849777936935, 0.0627395287156105, -0.0203714482486248, -0.009735504165291786, 0.06432165950536728, -0.04546240717172623, 0.0322086475789547, 0.004561635199934244, 0.040702879428863525, -0.0680280476808548, 0.025354159995913506, -0.07624178379774094, 0.06776861846446991, 0.07863514125347137, -0.037652503699064255, -0.023264721035957336, 0.030604641884565353, -0.07419195026159286, 0.014679630286991596, 0.1294829547405243, 0.007591600529849529, -0.06612348556518555, 0.03127516806125641, 0.10645392537117004, -0.018773522228002548, 0.03992835432291031, 0.044048961251974106, 0.00023814172891434282, -0.06797933578491211, -0.08000202476978302, -0.04320430010557175, 0.043590281158685684, -0.05034546181559563, 0.014501169323921204, 0.03329288214445114, 0.03045976720750332, -0.01932660862803459, -0.026188183575868607, -0.1232738122344017, -0.04858024790883064, -0.015570580027997494, 0.013346930965781212, 0.009410912171006203], "NOT":[0.02556992694735527, -0.0005189123330637813, 0.010195978917181492, -0.027382172644138336, -0.0374554842710495, 0.08793098479509354, 0.0024311996530741453, -0.08769379556179047, -0.054654307663440704, -0.08747632801532745, 0.09218847006559372, 0.0972878560423851, 0.044738128781318665, -0.02398994378745556, -0.046165600419044495, -0.0002692296984605491, -0.03797682002186775, 0.05161413550376892, -0.033769138157367706, 0.011279402300715446, 0.08941229432821274, -0.07437314093112946, -0.025249861180782318, 0.1026485413312912, -0.042062994092702866, 0.022835882380604744, 0.05108749121427536, -0.054616689682006836, -0.04208545386791229, 0.10205414891242981, -0.02474227361381054, -0.01605238951742649, -0.011079655028879642, -0.04231556877493858, -0.058844879269599915, 0.0017704797210171819, 0.005396600812673569, -0.058835554867982864, 0.03384264558553696, -0.024245088919997215, 0.03355555981397629, 0.02017929218709469, 0.04421762749552727, 0.09027500450611115, 0.03916880115866661, 0.042518291622400284, 0.024490609765052795, 0.00026937652728520334, -0.010342003777623177, -0.05488119646906853, 0.07418034970760345, 0.0008032438345253468, 0.09190968424081802, 0.07747997343540192, -0.024773627519607544, 0.0496656633913517, -0.038326963782310486, -0.0022213482297956944, 0.02448110282421112, 0.0022990668658167124, 0.052763812243938446, 0.051123637706041336, 0.03795074671506882, 0.06734737008810043, -0.030445149168372154, 0.021410485729575157, -0.044919464737176895, -0.0011586989276111126, -0.0903671532869339, -0.01408425159752369, 0.07342954725027084, -0.04118982329964638, -0.008432484231889248, -0.0008165669860318303, -0.0642886608839035, 0.007230957038700581, -0.0670868456363678, -0.01116579957306385, -0.09545603394508362, -0.03109285980463028, 0.005951744969934225, 0.024672016501426697, -0.04027184471487999, 0.03607063740491867, 0.023179687559604645, 0.0117312828078866, -0.019768331199884415, -0.023262612521648407, 0.04165903106331825, -0.039224691689014435, 0.040571704506874084, 0.08653629571199417, 0.027772698551416397, -0.08196783810853958, -0.013821743428707123, 0.004212009254842997, 0.01664070598781109, -0.008459849283099174, 0.041462354362010956, 0.06886350363492966], "OR":[-0.0010318798013031483, -0.058885037899017334, 0.015562368556857109, -0.03459857404232025, -0.006239954382181168, 0.04347813501954079, -0.043183062225580215, -0.06115246191620827, -0.08097145706415176, 
-0.040188197046518326, 0.02098822593688965, -0.013338722288608551, -0.01845080405473709, -0.07172099500894547, -0.00026761949993669987, 0.015059647150337696, -0.08275016397237778, 0.10280061513185501, -0.017712965607643127, -0.07511771470308304, 0.007648291997611523, -0.12827979028224945, -0.020353827625513077, 0.08809063583612442, -0.02829514630138874, 0.003038457129150629, -0.04399721696972847, 0.046383049339056015, 0.06416497379541397, -0.0006932668038643897, -0.033501505851745605, -0.012374987825751305, 0.018504725769162178, 0.00529597420245409, -0.040804456919431686, -0.00419827364385128, -0.017476536333560944, -0.04530858248472214, 0.01608600653707981, -0.08898036181926727, -0.015132613480091095, -0.053797122091054916, -0.011825251393020153, 0.09507828205823898, 0.08454664051532745, 0.04075947031378746, 0.020354142412543297, 0.01704799383878708, -0.026439497247338295, -0.04004717990756035, -0.053405825048685074, 0.04079057276248932, 0.026150185614824295, 0.04538597911596298, 0.046778932213783264, 0.057205770164728165, 0.037173718214035034, -0.07114585489034653, 0.03480122983455658, 0.0069038826040923595, -0.056386105716228485, -0.03294815868139267, 0.04636325314640999, -0.05767818167805672, -0.05788124352693558, -0.011048000305891037, -0.04350278526544571, 0.029680529609322548, -0.0512658953666687, 0.04321866109967232, 0.047014784067869186, -0.014913392253220081, -0.007425297982990742, -0.09810416400432587, -0.07316632568836212, 0.05063875392079353, -0.07298189401626587, -0.012434680946171284, -0.09386061877012253, 0.016765601933002472, 0.06658460199832916, 0.0014198448043316603, -0.022241152822971344, 0.05902376398444176, 0.057584285736083984, 0.024565961211919785, -0.02896188013255596, 0.006485136691480875, 0.05981580168008804, -0.015995489433407784, 0.027470067143440247, 0.09679803997278214, 0.0342426523566246, -0.08387557417154312, -0.015599220991134644, -0.0049544889479875565, -0.06524655222892761, 0.02150602824985981, 0.016511479392647743, 0.055177561938762665], @@ -115,7 +106,6 @@ "PCMPGTBrr":[-0.04665364325046539, -0.03588206693530083, 0.05219453573226929, 0.08376432955265045, 0.05562759190797806, -0.0034289404284209013, 0.08200010657310486, 0.023898538202047348, -0.002851601457223296, -0.08778133243322372, 0.017107484862208366, 0.08448091894388199, 0.020043527707457542, 0.038858626037836075, 0.036468397825956345, -0.0069902255199849606, -0.09442859143018723, 0.0018075992120429873, 0.05577728524804115, -0.0005804274696856737, 0.029588190838694572, -0.050955869257450104, 0.016604335978627205, -0.054141607135534286, -0.030936168506741524, 0.004688458051532507, -0.02321118488907814, -0.009524177759885788, 0.030161075294017792, -0.0557246096432209, 0.017830688506364822, 0.04058525711297989, 0.023080267012119293, 0.04536818340420723, 0.09658516198396683, 0.004083207808434963, 0.053284309804439545, 0.07114734500646591, 0.03272407501935959, -0.06646303087472916, 0.08200454711914062, -0.06558514386415482, 0.0745493471622467, -0.0010506648104637861, -0.02250707894563675, 0.015057512558996677, -5.047186277806759e-06, 0.04663649946451187, 0.06489380449056625, -0.0477377213537693, -0.08882559835910797, 0.08948437124490738, -0.052260447293519974, 0.06798093020915985, -0.06404604762792587, 0.0005905702710151672, 0.014312930405139923, 0.0370929092168808, 0.03622571751475334, 0.06601805984973907, 0.04077596217393875, -0.0019877473823726177, -0.02357509359717369, 0.04524341970682144, 0.024309739470481873, -0.05969798564910889, -0.015872884541749954, -0.055400021374225616, 0.04820183292031288, 
0.024034500122070312, -0.05125486105680466, 0.020366262644529343, 0.03310052305459976, 0.1036759540438652, 0.049202825874090195, -0.010945710353553295, -0.030628688633441925, 0.048871662467718124, 0.07457619905471802, 0.017111260443925858, 0.028184816241264343, -0.09065181016921997, -0.017116032540798187, -0.06233282387256622, -0.011385255493223667, -0.06190027296543121, -0.01189250499010086, -0.03632708638906479, 0.04705822467803955, 0.0022293981164693832, 0.06782552599906921, -0.0490303635597229, -0.08690774440765381, -0.08311695605516434, 0.04079030826687813, 0.022971853613853455, -0.019726071506738663, -0.032829709351062775, -0.05147984251379967, -0.06768873333930969], "PCMPGTDrr":[0.026095403358340263, 0.009877854026854038, -0.022390423342585564, -0.06749505549669266, 0.03866114094853401, 0.07523459941148758, -0.02331429533660412, -0.013958744704723358, -0.05151516944169998, -0.033018071204423904, -0.017118683084845543, 0.06611985713243484, 0.024562569335103035, 0.027193237096071243, -0.04081164300441742, -0.0557839497923851, 0.07676059752702713, -0.017435213550925255, -0.0696197971701622, 0.04529204219579697, 0.015718640759587288, -0.0868423655629158, -0.025476763024926186, 0.1075882539153099, 0.08407340198755264, 0.03219793736934662, -0.029079284518957138, -0.10067792236804962, -0.01665782555937767, -0.002518820110708475, 0.06302576512098312, -0.042360853403806686, -0.014688530936837196, -0.04797102138400078, -0.05708448588848114, 0.05345156416296959, -0.03360274061560631, -0.006362707354128361, 0.045909661799669266, -0.0034944594372063875, -0.04771789163351059, -0.015326191671192646, -0.017800530418753624, 0.009678518399596214, -0.01412744726985693, 0.09620117396116257, 0.0705861821770668, -0.0663042888045311, 0.07589521259069443, -0.08846025168895721, 0.008178732357919216, -0.023293234407901764, 0.049390021711587906, 0.00771696399897337, -0.026583032682538033, 0.012981866486370564, -0.06098538264632225, -0.04784953594207764, -0.001411060569807887, -0.0646580159664154, 0.07771933078765869, 0.012061100453138351, 0.026251494884490967, 0.024035189300775528, 0.00368816708214581, 0.019370727241039276, 0.0473535880446434, 0.0688827782869339, -0.0656280517578125, 0.0001225982268806547, -0.04765431582927704, 0.08570858836174011, 0.06544618308544159, 0.02309294231235981, -0.07891835272312164, 0.05969972908496857, 0.04259306937456131, -0.0388357900083065, 0.10700955986976624, -0.03643207252025604, -0.014097973704338074, 0.018475063145160675, -0.008959461003541946, -0.04132810980081558, -0.01586003415286541, -0.013873838819563389, 0.07354859262704849, 0.003967848140746355, -0.023853322491049767, -0.013099947944283485, 0.06407736241817474, 0.03060499019920826, -0.08859552443027496, 0.009045977145433426, -0.09939071536064148, -0.022137949243187904, -0.03951180726289749, -0.0316530205309391, 0.05501912534236908, -0.06330689787864685], "PEXTRWrr":[-0.05698293820023537, -0.02332535944879055, -0.01313185878098011, 0.08844685554504395, -0.030702419579029083, -0.042257267981767654, -0.06976033002138138, 0.08907881379127502, 0.040486857295036316, 0.01966431364417076, 0.011261478997766972, 0.011022844351828098, -0.0069642444141209126, -0.016230706125497818, -0.009695738554000854, -0.04666578397154808, 0.016855308786034584, -0.03308985382318497, 0.01504850760102272, 0.09940154105424881, -0.07109691947698593, 0.043378811329603195, -0.06964893639087677, -0.05999808758497238, 0.008651218377053738, 0.04237857088446617, 0.04557272046804428, 0.04033806174993515, -0.005760873202234507, 0.008976156823337078, 
0.05276636406779289, -0.06584233790636063, -0.011512805707752705, -0.01598522625863552, -0.044132646173238754, -0.020889364182949066, -0.09435509145259857, -0.02823605202138424, 0.0820322185754776, -0.0391690619289875, 0.03367430716753006, -0.029474111273884773, -0.07719384133815765, 0.003098628716543317, 0.05822441354393959, -0.09175454080104828, 0.02256210707128048, -0.004901964217424393, -0.008566503413021564, 0.040359016507864, -0.04049991816282272, 0.010366388596594334, -0.05293237417936325, -0.0956558957695961, -0.01418458204716444, 0.05464276298880577, -0.014091472141444683, 0.023551519960165024, -0.042662639170885086, -0.07025191932916641, -0.0017952515045180917, 0.07680258899927139, -0.10743812471628189, -0.08435508608818054, -0.00337960640899837, -0.03381747379899025, 0.027066459879279137, -0.009784750640392303, 0.04265652969479561, 0.02066781371831894, -0.03692338988184929, 0.0029027678538113832, 0.06893923878669739, 0.03784753382205963, -0.04037536308169365, -0.09532847255468369, 0.03193795308470726, 0.0387917198240757, 0.03887058049440384, -0.0002478501701261848, -0.0671166405081749, 0.06754262745380402, 0.01643708348274231, 0.012460017576813698, 0.03147564455866814, 0.05646798014640808, 0.014081758446991444, 0.07141963392496109, 0.016428180038928986, 0.0443485863506794, 0.06492826342582703, 0.09964785724878311, 0.026795320212841034, 0.0271765124052763, -0.015695465728640556, -0.08133535832166672, -0.05439477041363716, 0.04913243651390076, 0.024485180154442787, -0.04072758927941322], - "PHY_REG":[-0.008169060572981834, -0.017023155465722084, -0.04927198588848114, 0.0014261528849601746, 0.012259463779628277, -0.02794509381055832, -0.024857040494680405, 0.029203711077570915, 0.0433109886944294, 0.009679347276687622, -0.05811547115445137, -0.09075025469064713, -0.08525611460208893, -0.10545054078102112, 0.06474080681800842, 0.056396666914224625, 0.06781823933124542, 0.09059076011180878, -0.10420752316713333, -0.08284831047058105, 0.02349182404577732, -0.0354253351688385, -0.004627702757716179, 0.0068538435734808445, -0.053724177181720734, -0.02113335393369198, 0.05254676192998886, -0.050769440829753876, 0.061386119574308395, -0.07541731745004654, -0.024204161018133163, -0.0009893826209008694, -0.007493770215660334, -0.017051052302122116, 0.015025814063847065, -0.020427946001291275, -0.0844966471195221, 0.04589429497718811, 0.025571472942829132, -0.05280151963233948, 0.06895384937524796, 0.03960262984037399, 0.0068003153428435326, 0.09397424012422562, -0.0523529127240181, 0.03780638054013252, -0.015423302538692951, 0.029167350381612778, 0.01019437238574028, 0.023989612236618996, -0.03344425559043884, -0.07926471531391144, -0.09238854795694351, 0.04794330149888992, 0.01872367039322853, -0.029179377481341362, -0.05339968949556351, -0.04575541242957115, -0.004491546656936407, -0.009650425054132938, 0.026945313438773155, -0.02115861512720585, 0.06488905847072601, -0.06647083908319473, 0.008904196321964264, 0.010536684654653072, -0.06012551859021187, -0.00022655133216176182, -0.10175421833992004, 0.062001921236515045, -0.054452817887067795, 0.01785552129149437, -0.06749527156352997, -0.04883178323507309, -0.023449009284377098, 0.040745027363300323, 0.002448269398882985, 0.07842953503131866, -0.019806355237960815, -0.08275315910577774, 0.01131721492856741, 0.0482926219701767, 0.01892486959695816, 0.005685009527951479, -0.0055344682186841965, -0.0034555341117084026, -0.07923021167516708, 0.06387833505868912, 0.05978211387991905, -0.001252106623724103, 0.07216084003448486, 
-0.01223798282444477, 0.09716741740703583, 0.009659498929977417, -0.09404221922159195, -0.10122949630022049, -0.003581057768315077, 0.07885389029979706, 0.05305042862892151, -0.04988719895482063], "PMOVMSKBrr":[0.07294902205467224, -0.00040799094131216407, -0.01483855675905943, -0.02571418508887291, 0.08466307818889618, -0.03447218984365463, -0.05685977265238762, -0.019133185967803, 0.06332023441791534, -0.061352625489234924, -0.023195402696728706, -0.05378473922610283, -0.05650350823998451, 0.06583224982023239, -0.012845925986766815, -0.052972156554460526, 0.049470845609903336, -0.04565730318427086, 0.09717552363872528, -0.014171762391924858, 0.013508875854313374, 0.004057068843394518, -0.020556267350912094, -0.10475417971611023, 0.018426941707730293, -0.07273723930120468, 0.01702595315873623, -0.013097747229039669, -0.07530277967453003, 0.05442536994814873, -0.0601920410990715, -0.05255919322371483, -0.07305102050304413, 0.02758030779659748, 0.06180129200220108, 0.10606050491333008, 0.046477098017930984, -0.024062691256403923, 0.07360008358955383, -0.011283098720014095, -0.03712400794029236, -0.09973011910915375, 0.018314119428396225, 0.009135990403592587, -0.01891133189201355, 0.00915572326630354, 0.006080301944166422, -0.02368554100394249, -0.019582828506827354, 0.051494162529706955, -0.010953089222311974, 0.011621126905083656, 0.010515356436371803, 0.011188569478690624, -0.0202876515686512, 0.038686931133270264, -0.066365085542202, 0.014182188548147678, 0.00445093447342515, 0.05712618678808212, -0.04463819041848183, -0.10292281210422516, -0.011173201724886894, 0.0029098563827574253, 0.06890314072370529, 0.06398330628871918, 0.03248615562915802, -0.05457807704806328, -0.006898659747093916, 0.038892313838005066, -0.09130232781171799, 0.013324378058314323, -0.033766016364097595, -0.043404608964920044, 0.018701359629631042, -0.03784232959151268, -0.05014420300722122, 0.04404780641198158, 0.09254389256238937, 0.09839074313640594, -0.028214668855071068, 0.03262662887573242, 0.04281335324048996, 0.07356158643960953, -0.0773080587387085, 0.026536725461483, -0.06819723546504974, 0.03335537016391754, 0.09355103969573975, -0.052649617195129395, -0.08467497676610947, -0.06516479700803757, -0.07499512284994125, 0.023276200518012047, -0.06063856557011604, -0.044472258538007736, 0.03155883774161339, -0.011262890882790089, 0.04045895114541054, 0.012343645095825195], "PMULUDQrr":[-0.018331514671444893, 0.04249238595366478, 0.0718526765704155, 0.03221653401851654, -0.04829120263457298, -0.02055567130446434, 0.05200991779565811, -0.04337913170456886, -0.02698952704668045, 0.05037892237305641, 0.014545431360602379, 0.09035851061344147, 0.0777752548456192, -0.06762461364269257, 0.032133519649505615, 0.048851024359464645, 0.01295433659106493, 0.054136257618665695, 0.09599477052688599, 0.024489495903253555, 0.05683024227619171, -0.05242127552628517, -0.043476004153490067, 0.004586773458868265, 0.024281315505504608, 0.03402777388691902, 0.0033939755521714687, 0.049474406987428665, 0.0011405921541154385, 0.06828528642654419, 0.08426304161548615, -0.029339993372559547, -0.04173621907830238, -0.03966334089636803, -0.03011258877813816, -0.07684683799743652, 0.040944185107946396, -0.04709877818822861, 0.07968004047870636, 0.07534269988536835, -0.006957313045859337, -0.0016522067598998547, -0.017229178920388222, 0.030470186844468117, 0.05390452966094017, 0.05233803763985634, 0.045554302632808685, -0.03710555285215378, 0.05699322372674942, 0.019888387992978096, 0.10152119398117065, 0.026563912630081177, 
-0.0018862299621105194, -0.02453959546983242, -0.06107368320226669, -0.04910692200064659, -0.06316373497247696, 0.04648333042860031, -0.00939352996647358, 0.030374331399798393, 0.0027768383733928204, 0.07302171736955643, -0.0035402378998696804, 0.054474033415317535, -0.0739617869257927, 0.01190911140292883, -0.019428657367825508, -0.006644500885158777, -0.04998863860964775, 0.03215506672859192, 0.054085105657577515, 0.047874726355075836, 0.10735851526260376, 0.030255280435085297, 0.029996531084179878, 0.006218941882252693, 0.04892734810709953, 0.06425125896930695, -0.017792150378227234, 0.041398752480745316, -0.017293022945523262, -0.011015499010682106, -0.02933122031390667, -0.005825115367770195, -0.07212502509355545, 0.10469445586204529, 0.009840304031968117, 0.026172513142228127, 0.002459621522575617, -0.02771947532892227, -0.006639100145548582, -0.04062161594629288, -0.0746249407529831, 0.04523816704750061, -0.07439430058002472, 0.06977812945842743, 0.008738852106034756, 0.06937781721353531, 0.07391723990440369, -0.09542208909988403], "POPCNT":[0.032459065318107605, 0.11127372831106186, -0.004006756469607353, 0.06373029947280884, 0.07161973416805267, -0.07966824620962143, -0.014274416491389275, 0.02168503776192665, -0.060636017471551895, -0.051414258778095245, 0.003268218832090497, 0.05552225932478905, 0.01940925046801567, -0.05398592725396156, 0.09021458029747009, -0.060922130942344666, -0.0407782681286335, -0.027882883325219154, 0.012706448324024677, -0.02730434015393257, 0.05854162946343422, -0.0798129290342331, -0.00179530237801373, 0.04958317428827286, -0.04621487483382225, 0.0524308979511261, -0.03889109939336777, 0.07240460813045502, 0.06366933137178421, 0.029314585030078888, -0.014743340201675892, -0.021233027800917625, 0.06803205609321594, -0.01269250176846981, -0.033408213406801224, 0.09638478606939316, -0.02009841799736023, -0.014619074761867523, 0.022498659789562225, 0.006679723970592022, -0.016163295134902, 0.09717728197574615, -0.010882971808314323, -0.09489153325557709, 0.046623144298791885, -0.04596618935465813, -0.026864662766456604, 0.01605546846985817, 0.05979238823056221, -0.024411896243691444, 0.039511535316705704, -0.0108433086425066, -0.05629622936248779, 0.02339898608624935, -0.025785285979509354, 0.011886742897331715, 0.08834438771009445, -0.08506806194782257, 0.021776534616947174, 0.01446699257940054, -0.009117010980844498, -0.022380229085683823, -0.0541100800037384, -0.040569182485342026, -0.02888612262904644, 0.07774273306131363, -0.052350424230098724, -0.039240963757038116, 0.004771160893142223, 0.014987779781222343, -0.05511622130870819, 0.019763313233852386, -0.0920683741569519, 0.021821241825819016, 0.10812623798847198, -0.06422155350446701, -0.07388156652450562, 0.00949418731033802, -0.06905169039964676, 0.006180475000292063, -0.02844754233956337, 0.11084792017936707, -0.03348945826292038, 0.06860767304897308, -0.0214154664427042, -0.0008655296987853944, -0.020698973909020424, 0.03369581326842308, 0.019848104566335678, 0.013533092103898525, 0.03423681482672691, 0.014547858387231827, 0.02418140508234501, -0.013769546523690224, -0.09633788466453552, 0.01689709909260273, -0.01452709175646305, 0.047873757779598236, -0.0012036423431709409, 0.03720762953162193], @@ -145,7 +135,6 @@ "RET":[-0.09685279428958893, 0.0101965656504035, -0.04206235706806183, -0.05282443389296532, 0.050776951014995575, -0.006812752690166235, 0.09618920832872391, 0.04637071117758751, -0.018928129225969315, -0.04118828848004341, -0.06039129197597504, -0.018619466572999954, 
-0.07845143973827362, -0.14034120738506317, -0.03397035226225853, -0.028233898803591728, -0.08162513375282288, 0.048710327595472336, -0.04177732393145561, -0.08455172181129456, 0.00312337395735085, 0.03531079366803169, -0.057201240211725235, 0.09391707926988602, -0.02847883477807045, 0.01840023323893547, -0.04936904460191727, 0.027487540617585182, 0.08041024953126907, -0.08714525401592255, 0.11963017284870148, -0.0762581005692482, -0.06482874602079391, 0.038007382303476334, -0.003661463735625148, 0.0064629544503986835, -0.08281382918357849, -0.053177930414676666, 0.01966426707804203, -0.04822755232453346, -0.0474051795899868, 0.026990806683897972, -0.057971399277448654, 0.12347304075956345, -0.02745792828500271, 0.0832793116569519, 0.03029884397983551, -0.032751865684986115, -0.022912420332431793, -0.030569355934858322, 0.0971289873123169, 0.07298070192337036, 0.0306894201785326, 0.05817654728889465, 0.005174126010388136, 0.042281877249479294, 0.01975836046040058, -0.11205509305000305, 0.05081645026803017, 0.0034761943388730288, -0.03858469799160957, 0.007316718343645334, 0.07441510260105133, 0.004579664673656225, -0.021868426352739334, 0.01116174180060625, 0.061042461544275284, 0.029598504304885864, -0.06691239774227142, 0.03223221376538277, 0.0867755264043808, 0.05488765984773636, -0.019738517701625824, -0.030367519706487656, -0.06396497040987015, -0.0022451707627624273, -0.06131305173039436, -0.03129804506897926, -0.05657076835632324, 0.009733426384627819, -0.08145039528608322, -0.09049411863088608, 0.004821183159947395, 0.038612931966781616, -0.019062234088778496, -0.021097682416439056, -0.06061801686882973, 0.019766775891184807, 0.0276743546128273, -0.057942990213632584, -0.033430278301239014, 0.0043391571380198, 0.05848158895969391, 0.0826464518904686, 0.09988056123256683, -0.05677378550171852, -0.11326800286769867, 0.051275406032800674, 0.01158174965530634, 0.04368240013718605], "ROL":[0.026423713192343712, 0.08523924648761749, 0.005345864687114954, 0.027778491377830505, 0.06572498381137848, 0.056946828961372375, -0.03009108640253544, -0.05564097315073013, 0.07753216475248337, -0.07402804493904114, -0.05589171126484871, -0.050976503640413284, 0.041095346212387085, -0.06708681583404541, 0.08517566323280334, 0.02110634744167328, -0.027871981263160706, 0.0005450723110698164, 0.07511565834283829, 0.0016275837551802397, 0.04902505874633789, 0.024746844545006752, 0.08780711144208908, -0.06167766824364662, 0.06365402787923813, 0.06462119519710541, -0.04920244216918945, 0.056112516671419144, 0.10561680048704147, 0.07879003882408142, 0.03879575803875923, -0.03582729026675224, 0.004805437754839659, 0.030719229951500893, 0.0558336041867733, 0.04387545958161354, 0.020841658115386963, 0.015068157576024532, -0.008266274817287922, 0.05914990231394768, -0.01581275276839733, 0.060716625303030014, -0.02257946878671646, 0.00995479617267847, 0.002104438142850995, 0.03806104138493538, 0.010437156073749065, 0.039603881537914276, -0.02074524573981762, 0.024094516411423683, 0.031944990158081055, -0.07122939079999924, -0.023190783336758614, 0.006518832873553038, -0.04528677463531494, -0.02354210615158081, -0.03518632799386978, -0.07059651613235474, -0.017474880442023277, 0.06688393652439117, -0.07900173962116241, -0.05843310430645943, 0.05351021885871887, -0.05724814161658287, -0.02697751857340336, -0.031128596514463425, 0.03040527179837227, -0.009157841093838215, -0.07642515003681183, -0.042137425392866135, -0.031383614987134933, -0.07586777210235596, 0.0489036925137043, 0.0657171905040741, 
0.027123138308525085, 0.034842655062675476, -0.035231154412031174, 0.009778738021850586, -0.06150955334305763, 0.042132262140512466, 0.08945925533771515, -0.07213590294122696, -0.0518047958612442, -0.07094760239124298, 0.07041053473949432, 0.1046413779258728, 0.02394813485443592, -0.014966128394007683, -0.04967860132455826, -0.03941388055682182, -0.10642798990011215, -0.03915626183152199, -0.10921923071146011, -0.035421375185251236, 0.039855729788541794, 0.04145469889044762, -0.025123557075858116, 0.06743432581424713, -0.02060243859887123, 0.02994687482714653], "ROR":[-0.03797177970409393, -0.03406170755624771, -0.014866529032588005, -0.002243943279609084, 0.024476991966366768, -0.08789698034524918, 0.02924288995563984, -0.03145875036716461, -0.0030907171312719584, 0.013303312472999096, 0.05823688209056854, 0.06085257977247238, 0.0682583823800087, 0.06680850684642792, -0.0008473473135381937, 0.056926507502794266, 0.05309343710541725, 0.017690004780888557, -0.028605103492736816, 0.02303914539515972, -0.07054196298122406, -0.011117611080408096, -0.0012138717574998736, -0.0877937376499176, -0.005339651368558407, -0.029197875410318375, -0.06283852458000183, 0.00677055399864912, 0.07529082894325256, -0.005144342314451933, -0.03930655121803284, -0.0469868965446949, 0.06799482554197311, -0.013870766386389732, -0.07353825122117996, -0.10425472259521484, 8.023920236155391e-05, 0.05196760594844818, -0.024758316576480865, -0.03249195218086243, -0.0037688545417040586, -0.0033505349420011044, 0.04382188990712166, 0.035679250955581665, -0.04743441194295883, 0.031142324209213257, -0.04255860671401024, -0.02310662344098091, -0.04199622571468353, -0.034439221024513245, -0.06397263705730438, -0.011049525812268257, -0.055776823312044144, 0.039233505725860596, 0.016644736751914024, -0.08737850934267044, 0.0151174021884799, 0.10728199779987335, -0.0006503594922833145, -0.060365013778209686, -0.05337308719754219, -0.021152105182409286, 0.06532585620880127, -0.00926337018609047, -0.08149554580450058, 0.0485830195248127, 0.034749776124954224, -0.05045035108923912, -0.06366241723299026, 0.0544571727514267, 0.07594002038240433, 0.027496861293911934, -0.047294747084379196, 0.017491186037659645, -0.034639474004507065, 0.006060798652470112, -0.07335491478443146, -0.054728057235479355, -0.0018357941880822182, -0.07110298424959183, 0.09072742611169815, 0.03083305060863495, 0.054598040878772736, -0.028097454458475113, -0.012821618467569351, 0.008708478882908821, -0.06561881303787231, -0.04448843002319336, 0.08860815316438675, -0.050312310457229614, 0.09012935310602188, -0.004711236339062452, -0.020932462066411972, -0.10615857690572739, -0.005630030296742916, 0.03976801037788391, 0.040199730545282364, 0.07235082983970642, -7.448523683706298e-05, 0.076942078769207], - "RegisterMask":[0.009287647902965546, 0.029691029340028763, -0.03465871885418892, 0.032606374472379684, -0.007339544594287872, 0.03367740660905838, -0.0661492720246315, 0.0436118021607399, -0.002896533813327551, 0.028440887108445168, -0.06791415065526962, 0.004055356606841087, -0.01596181094646454, -0.003846745239570737, 0.06762582808732986, -0.025632556527853012, 0.08132420480251312, 0.025554664433002472, -0.08994632959365845, 0.02521730400621891, 0.023826507851481438, 0.0004487193073146045, 0.01047397032380104, 0.03246957063674927, -0.033482909202575684, 0.05051224306225777, 0.005778896156698465, -0.0006257061613723636, 0.00522293895483017, -0.04666636884212494, 0.022335125133395195, -0.022150320932269096, 0.04510439187288284, -0.02769547514617443, 
0.026804683730006218, 0.0710473507642746, -0.014513042755424976, 0.0695318952202797, 0.048469461500644684, -0.008654370903968811, -0.028613079339265823, -0.02918054349720478, -0.022721733897924423, -0.0004791628452949226, 0.011470172554254532, 0.08561886101961136, 0.07125027477741241, -0.05847848951816559, 0.011811288073658943, -0.025244031101465225, -0.03665035218000412, -0.03482883796095848, 0.04196881502866745, 0.06909161061048508, 0.02365143597126007, -0.0689089447259903, -0.0707414448261261, -0.03962424397468567, -0.025703679770231247, 0.06502455472946167, 0.057676125317811966, 0.026916807517409325, 0.024921152740716934, 0.009799988009035587, -0.018656229600310326, 0.009880480356514454, -0.06516153365373611, 0.019290866330266, 0.02236226759850979, -0.02598695270717144, -0.00299705658107996, 0.019448822364211082, -0.014883329160511494, 0.06645222008228302, -0.028751512989401817, -0.01589173451066017, 0.026225939393043518, 0.07285763323307037, -0.06037987396121025, -0.027615630999207497, -0.039930179715156555, -0.07122864574193954, 0.029825787991285324, 0.026364129036664963, -0.04438399150967598, 0.07015394419431686, -0.013950555585324764, 0.004367176443338394, 0.020521124824881554, 0.02030497044324875, 0.011951270513236523, 0.06765977293252945, -0.015042259357869625, 0.005189584568142891, -0.07532864063978195, -0.010886142030358315, 0.006792030762881041, -0.06348442286252975, 0.031859394162893295, -0.052482619881629944], "SAR":[-0.058561697602272034, -0.014889497309923172, -0.009758144617080688, 0.00019282882567495108, -0.040600407868623734, -0.05907759070396423, 0.033052023500204086, -0.04672614857554436, -0.050173744559288025, -0.06619776040315628, 0.005385559983551502, 0.05449973791837692, -0.0035163976717740297, -0.12835650146007538, 0.06576846539974213, 0.030572880059480667, -0.014856431633234024, 0.011252024210989475, 0.018954169005155563, -0.10070347040891647, -0.032273050397634506, 0.007221086882054806, -0.020879192277789116, 0.0691007450222969, 0.01286559458822012, -0.020694725215435028, -0.07545264810323715, -0.07742343097925186, -0.005103116389364004, 0.10223732143640518, -0.08521754294633865, 0.07459715753793716, 0.006563629489392042, -0.059839747846126556, -0.023294325917959213, 0.04265525937080383, -0.011012998409569263, -0.02257128618657589, -0.033783379942178726, 0.0368407666683197, -0.048024341464042664, -0.037417128682136536, 0.09010431170463562, 0.09016482532024384, -0.07939734309911728, 0.03274676203727722, 0.0388714037835598, -0.03253694251179695, 0.020820122212171555, -0.0039061333518475294, 0.025425976142287254, -0.01847209222614765, 0.013026821427047253, 0.08873090147972107, -0.010358930565416813, -0.026935681700706482, 0.04795868322253227, -0.06173045188188553, -0.02299962192773819, -0.09966729581356049, 0.008027775213122368, 0.03202224150300026, -0.08922284096479416, 0.03263246268033981, 0.0702379047870636, 0.08681228011846542, -0.053993936628103256, 0.0009890834335237741, -0.060423459857702255, 0.08636976033449173, 0.04784319922327995, 0.05135124549269676, -0.023515762761235237, 0.015414481982588768, -0.06941155344247818, 0.004289102740585804, -0.10909571498632431, 0.014149827882647514, -0.025285568088293076, 0.06270574778318405, 0.0669349953532219, 0.03599094599485397, 0.0436582937836647, 0.06281902641057968, -0.04479018226265907, -0.04126136004924774, -0.026938045397400856, -0.0349077507853508, 0.002964549232274294, -0.04247729107737541, 0.009402072057127953, 0.10574454814195633, 0.03262042999267578, 0.08030910044908524, -0.031244831159710884, 
0.010621835477650166, -0.02628093585371971, 0.046942535787820816, -0.022998474538326263, 0.009223603643476963], "SBB":[-0.040700677782297134, -0.01474229246377945, 0.09491399675607681, 0.015464535914361477, -0.05408482998609543, -0.09618491679430008, -0.014700816012918949, -0.06255258619785309, 0.09308589994907379, 0.01991264335811138, 0.04899228736758232, -0.03322140499949455, -0.03979090601205826, -0.161369189620018, 0.0957769826054573, -0.045866891741752625, -0.03776619955897331, 0.09559016674757004, -0.0063005415722727776, 0.07086999714374542, -0.004713557660579681, 0.10066409409046173, -0.053719762712717056, 0.07039386034011841, 0.01788068749010563, 0.01069885678589344, -0.003849055850878358, 0.07810717821121216, 0.10748977214097977, -0.09462521225214005, -0.06140149384737015, -0.028434589505195618, 0.0395897701382637, 0.05396975204348564, 0.009982907213270664, -0.014297235757112503, 0.018435295671224594, -0.04264533147215843, -0.0471954308450222, -0.008587008342146873, 0.010918513871729374, -0.03147284686565399, 0.08885594457387924, 0.05178891867399216, 0.05807363614439964, 0.028190992772579193, 0.04205470532178879, 0.00935433991253376, 0.027427801862359047, -0.02180725708603859, -0.06614664196968079, 0.021269382908940315, 0.0585390068590641, 0.12827278673648834, 0.0420454666018486, 0.06753493845462799, -0.05479112267494202, -0.06480395793914795, 0.02621031180024147, -0.07586188614368439, -0.04831313341856003, 0.016674980521202087, -0.006851759273558855, 0.04103298485279083, 0.005965645890682936, -0.02317493036389351, -0.03966135531663895, -0.02576862834393978, -0.0916895642876625, 0.029451601207256317, 0.044677067548036575, 0.026928072795271873, -0.10388721525669098, 0.021140936762094498, -0.06990157812833786, 0.048356350511312485, -0.08890967816114426, -0.0003503488842397928, -0.10245566070079803, -0.0582563653588295, 0.04677841439843178, 0.04697449132800102, -0.04022470489144325, 0.02759086713194847, -0.02867579087615013, 0.013355317525565624, 0.011504339054226875, -0.04230086877942085, -0.045500747859478, -0.03741880878806114, 0.022458063438534737, 0.05192841589450836, 0.008104681968688965, -0.08284809440374374, 0.059996478259563446, 0.07762005180120468, -0.0031316280364990234, 0.06990513950586319, -0.020328091457486153, -0.0027387691661715508], "SETB_C":[-0.007473401725292206, -0.06315194815397263, 0.0693482831120491, 0.05207814276218414, -0.08006429672241211, -0.005448522046208382, -0.007457572966814041, 0.011581258848309517, -0.05411145091056824, -0.06738752871751785, 0.013233165256679058, -0.0677611380815506, 0.01846255734562874, -0.09321920573711395, -0.03116961196064949, 0.05861300230026245, -0.001519175828434527, 0.08354826271533966, 0.023905213922262192, 0.0124649154022336, 0.08983863890171051, 0.055941760540008545, 0.07229111343622208, 0.09052376449108124, -0.013718990609049797, 0.06642850488424301, 0.0822976604104042, 0.010060268454253674, 0.04116540774703026, -0.03301406651735306, 0.07296404242515564, -0.03534134477376938, 0.012426529079675674, -0.005412430968135595, 0.06087784096598625, 0.03547677770256996, -0.007232111878693104, 0.06580550968647003, -0.0037480974569916725, 0.02971699647605419, -0.06937503069639206, 0.08572175353765488, -0.02138090692460537, -0.0053040217608213425, -0.029469722881913185, -0.05332958698272705, 0.10073655843734741, -0.03199373558163643, -0.01775289885699749, -0.09716105461120605, 0.06483447551727295, 0.028643250465393066, -0.029914388433098793, 0.007070464547723532, 0.006640028208494186, -0.0033612342085689306, 0.005682659335434437, 
0.011877131648361683, -0.038144148886203766, 0.03381858021020889, 0.02083616890013218, 0.029199717566370964, 0.07813020050525665, -0.006173993926495314, -0.016444502398371696, -0.08474857360124588, 0.03877300024032593, -0.046462398022413254, 0.02460806630551815, 0.053950369358062744, 0.01389766950160265, 0.03323421627283096, 0.04349416866898537, 0.04381947219371796, 0.10320119559764862, -0.1117740124464035, 0.03045269101858139, -0.03870442137122154, -0.07607249915599823, -0.00020808610133826733, -0.09519094228744507, 0.06727365404367447, -0.04469249024987221, 0.07144048810005188, -0.08811240643262863, 0.001203814405016601, 0.06901863217353821, 0.05462682247161865, -0.03902207687497139, -0.05885632708668709, -0.028275305405259132, -0.07151838392019272, -0.059166230261325836, -0.015570566058158875, 0.06314826756715775, -0.040293656289577484, 0.021595094352960587, -0.04083842411637306, -0.09180022031068802, -0.0309903621673584], @@ -263,26 +252,6 @@ "VINSERTI":[0.09046315401792526, 0.015515458770096302, 0.04200809448957443, -0.046130646020174026, -0.045843131840229034, 0.003743539797142148, 0.025380345061421394, -0.021319231018424034, 0.03850293532013893, 0.006397924851626158, -0.06982530653476715, -0.016159888356924057, -0.09164588898420334, 0.1245846077799797, -0.05104857683181763, -0.011446121148765087, -0.06936608999967575, -0.02683587372303009, -0.0337526798248291, -0.005495472811162472, 0.023584537208080292, -0.0771733894944191, -0.026287894695997238, -0.01172191184014082, 0.09737047553062439, 0.0375351756811142, 0.03280220180749893, -0.014072075486183167, 0.06032971292734146, 0.0072259255684912205, -0.08368974179029465, 0.054626062512397766, -0.021156134083867073, -0.09647785127162933, 0.07431179285049438, -0.09039300680160522, -0.07652204483747482, -0.002478789770975709, -0.012967151589691639, 0.08174770325422287, -0.00968913547694683, 0.015551612712442875, 0.08655177801847458, -0.056927114725112915, -0.011370684020221233, -0.0408773347735405, 0.04413295164704323, 0.05919815972447395, 0.08101782202720642, -0.008914918638765812, -0.019233090803027153, 0.05211508646607399, -0.010292282328009605, 0.021839600056409836, 0.0016950241988524795, -0.031931016594171524, 0.004831018857657909, 0.015328328125178814, -0.015326892025768757, -0.05457184836268425, 0.03782501816749573, -0.014512602239847183, 0.00869232788681984, -0.04001179710030556, -0.00994281005114317, -0.041689563542604446, 0.060574647039175034, 0.044912341982126236, 0.05958174169063568, -0.035378437489271164, 0.08524063974618912, 0.012326095253229141, -0.052227456122636795, -0.015090923756361008, -0.012893415056169033, -0.019565775990486145, -0.03284028172492981, -0.02651887759566307, 0.02436136268079281, 0.004743371158838272, 0.019924448803067207, -0.046163417398929596, 0.005615816451609135, -0.03354670852422714, 0.00801338255405426, 0.02501787059009075, 0.03313247114419937, -0.012842012569308281, 0.04856807366013527, -0.031942710280418396, -0.026277944445610046, 0.11483185738325119, 0.015686793252825737, 0.052031729370355606, 0.025188622996211052, -0.021448785439133644, 0.05062439665198326, -0.030834710225462914, 0.02746596559882164, 0.027027780190110207], "VINSERTPSrm":[0.010265239514410496, 0.03508870303630829, 0.0182888712733984, 0.01066108699887991, 0.09608251601457596, -0.0390457920730114, -0.02508910745382309, -0.051061324775218964, 0.051924970000982285, 0.02405509166419506, 0.07347054034471512, -0.023432396352291107, 0.03053455613553524, 0.11051331460475922, -0.07987380027770996, -0.07169938087463379, 
-0.06944284588098526, -0.010227087885141373, 0.01555782649666071, 0.0033066831529140472, -0.017572278156876564, 0.018880389630794525, -0.03347453102469444, 0.023936258628964424, 0.04189354181289673, -0.008910899050533772, 0.045309878885746, 0.039228133857250214, -0.0026367944665253162, 0.01713910512626171, -0.0038225038442760706, -0.02550170198082924, -0.04479162022471428, -0.11607012152671814, -0.05566885322332382, -0.03926549106836319, -0.05618799477815628, 0.0587196871638298, -0.003744689514860511, 0.09148088097572327, -0.008691483177244663, 0.060393813997507095, 0.05017181858420372, 0.05314680561423302, 0.010222898796200752, 0.04390108212828636, 0.06256565451622009, -0.039335936307907104, -0.030927496030926704, -0.010439696721732616, -0.09615844488143921, 0.04857023432850838, -0.021018074825406075, -0.04949686676263809, 0.0718517154455185, -0.0008082279819063842, -0.05119258537888527, -0.016725104302167892, -0.031902290880680084, 0.07473913580179214, -0.07040376961231232, -0.06263279169797897, -0.01866966485977173, -0.04580819234251976, -0.0018242622027173638, 0.02124813199043274, 0.01608111523091793, 0.033293697983026505, 0.04724595695734024, 0.06764823198318481, -0.010222701355814934, -0.09166357666254044, 0.0065320758149027824, 0.03907076641917229, 0.014404546469449997, -0.04371245950460434, -0.036747194826602936, -0.013570152223110199, -0.04874530807137489, -0.048001520335674286, 0.015114152804017067, 0.018710903823375702, 0.06920618563890457, -0.04024452716112137, 0.07851467281579971, -0.03879975154995918, 0.039278119802474976, -0.06678346544504166, -0.02139596827328205, 0.03483150899410248, 0.047795433551073074, -0.08071790635585785, 0.05023014172911644, -0.05863310769200325, 0.04391729459166527, -0.006796867586672306, -0.07156652212142944, -0.016402525827288628, 0.07568787783384323, 0.01640038751065731], "VINSERTPSrr":[0.022902479395270348, 0.024018414318561554, 0.011011038906872272, 0.009072761051356792, -0.057881347835063934, 0.03739643841981888, 0.006768766324967146, 0.07734010368585587, 0.001509909750893712, 0.043335434049367905, -0.07244917005300522, -0.1078081876039505, 0.027186766266822815, 0.018834171816706657, 0.007436624728143215, -0.048498328775167465, 0.09450934827327728, -0.015420452691614628, 0.014672537334263325, -0.0012827727477997541, 0.019664635881781578, -0.026565955951809883, -0.04819858446717262, -0.0004387270309962332, 0.01676507107913494, -0.0014571163337677717, 0.015105879865586758, 0.04038102179765701, 0.008408628404140472, 0.07757255434989929, -0.09923559427261353, -0.04181523248553276, -0.0313955582678318, -0.006045420188456774, 0.05904707312583923, -0.014993838034570217, 0.03219055384397507, 0.058543696999549866, -0.06872320920228958, 0.021718619391322136, -0.08984571695327759, 0.06557019799947739, -0.018167613074183464, -0.011413732543587685, 0.036035921424627304, 0.10104569792747498, 0.05836406350135803, -0.02576756477355957, 0.03827998414635658, -0.06874323636293411, 0.01668366976082325, 0.048310816287994385, -0.010213772766292095, -0.035550933331251144, -0.03385040909051895, 0.004614332225173712, 0.018951643258333206, 0.10679180175065994, -0.019135646522045135, -0.011955377645790577, 0.028140606358647346, -0.08185642957687378, -0.015775075182318687, -0.011507326737046242, 0.07914295047521591, -0.030148068442940712, 0.11757981777191162, 0.00040086795343086123, 0.056880321353673935, -0.014461426064372063, -0.0008378245402127504, -0.06245473772287369, -0.05332277715206146, -0.038401950150728226, 0.005011103581637144, 0.0368003323674202, 
-0.021230563521385193, 0.01497745979577303, 0.0372738242149353, 0.07988940924406052, -0.013381360098719597, 0.0036820468958467245, 0.07501927763223648, -0.0996084213256836, 0.028014982119202614, -0.09410325437784195, 0.0007335525006055832, 0.004884959198534489, 0.040397197008132935, -0.07655651122331619, 0.05677357316017151, 0.005359896458685398, -0.047478366643190384, 0.03828851506114006, 0.03363208472728729, -0.041756175458431244, -0.00031817640410736203, 0.022837577387690544, 0.039567966014146805, 0.03662540763616562], - "VIRT_REG_FR32":[0.0034248235169798136, -0.011980761773884296, -0.0501178540289402, 0.0494888611137867, 0.06103336811065674, -0.06178610771894455, 0.007709897588938475, -0.011392943561077118, 0.06570645421743393, 0.0771368145942688, 0.0005577280535362661, 0.013396150432527065, -0.041660163551568985, 0.05122360959649086, 0.11354377865791321, -0.009875510819256306, -0.06466709822416306, 0.048170577734708786, 0.0007201629341579974, 0.06538223475217819, 0.08870227634906769, -0.05771782249212265, 0.009273379109799862, -0.03325295075774193, 0.01197165809571743, 0.06604835391044617, 0.08265330642461777, -0.005758166313171387, 0.02512396313250065, 0.03383670747280121, 0.038484204560518265, -0.06539343297481537, -0.013461028225719929, 0.001498897559940815, 0.05170154944062233, 0.06965786963701248, -0.07339458167552948, 0.05094756931066513, 0.01983451284468174, -0.06855696439743042, 0.07892709225416183, 0.06099703162908554, 0.08492864668369293, 0.05357863008975983, -0.009294840507209301, -0.0054923719726502895, -0.029938997700810432, 0.028260599821805954, 0.053790509700775146, -0.06574371457099915, -0.009621666744351387, -0.08131514489650726, -0.08474338054656982, 0.039622966200113297, 0.06945627927780151, 0.02545306645333767, 0.005390701815485954, 0.04582791030406952, -0.1103447750210762, -0.050917647778987885, 0.03087870217859745, 0.06918162852525711, 0.0548822283744812, -0.01838473603129387, 0.05597897991538048, 0.03548860549926758, -0.009931124746799469, -0.07856663316488266, 0.033994875848293304, 0.03467561677098274, 0.09580692648887634, -0.04153195023536682, -0.06732118874788284, -0.06857144832611084, 0.03419093042612076, -0.01200241968035698, -0.06983492523431778, 0.05929506942629814, -0.00041734304977580905, -0.026396293193101883, 0.05230500176548958, -0.006162640172988176, 0.044198282063007355, -0.028765834867954254, 0.031155114993453026, 0.06967037916183472, -0.0892564132809639, 0.028816571459174156, -0.037065472453832626, 0.06540130823850632, -0.01888667233288288, 0.030632384121418, 0.0359313078224659, 0.106044240295887, 0.03259910270571709, -0.0775517001748085, -0.04267778620123863, 0.04977935180068016, -0.01790289767086506, -0.11223265528678894], - "VIRT_REG_FR64":[0.08496882021427155, 0.049308884888887405, -0.016840212047100067, 0.010602951049804688, -4.6025739720789716e-05, -0.06524767726659775, 0.048670798540115356, -0.06444543600082397, -0.0031944462098181248, 0.05608433857560158, -0.03958145156502724, 0.05171080678701401, -0.03572545200586319, -0.054364755749702454, 0.052311528474092484, -0.0361458919942379, 0.024109655991196632, 0.15923210978507996, -0.07255382835865021, -0.011799084022641182, -0.06846465915441513, 0.0023571476340293884, 0.02642918936908245, -0.05057685822248459, 0.029800178483128548, -0.06036723777651787, -0.012272411957383156, -0.022802220657467842, -0.02426644042134285, 0.05623406544327736, -0.07506053894758224, -0.02078152634203434, 0.02549685165286064, -0.030025657266378403, -0.0627482682466507, 0.062375299632549286, 0.03684084117412567, 
0.06365678459405899, 0.0004415051080286503, -0.002180535811930895, 0.05225013941526413, -0.0693102702498436, -0.03649357333779335, 0.005159272346645594, -0.03298519179224968, 0.041419681161642075, -0.05325934663414955, -0.017585784196853638, -0.03843431547284126, -0.002649943344295025, 0.033329058438539505, -0.04736043140292168, -0.043852102011442184, -0.06713785231113434, -0.03237355872988701, 0.012679073959589005, -0.01959240809082985, 0.07324203103780746, 0.07468831539154053, 0.03327644243836403, -0.01596391387283802, 0.12015434354543686, 0.051839299499988556, 0.00980563648045063, -0.08275608718395233, 0.04445798322558403, -0.03891860321164131, 0.10891054570674896, -0.008730625733733177, -0.051655255258083344, -0.05982912331819534, 0.04106972739100456, 0.06872759014368057, 0.013289053924381733, 0.03469584137201309, -0.06673429906368256, -0.0695682018995285, 0.047426726669073105, 0.02815094031393528, -0.05552271753549576, 0.0010567272547632456, -0.051840681582689285, -0.01704293303191662, -0.047185055911540985, 0.036965738981962204, 0.03452568128705025, -0.05430837720632553, 0.0383443646132946, 0.0003438846324570477, -0.030417989939451218, 0.02749026007950306, -0.0546082966029644, 0.03005768544971943, 0.0025131346192210913, 0.0013019279576838017, -0.054173994809389114, -0.008382225409150124, 0.02153395675122738, 0.011912085115909576, -0.10461334884166718], - "VIRT_REG_GR16":[0.09543223679065704, 0.03513967618346214, 0.08986528217792511, -0.012217407114803791, -0.02076001651585102, -0.04190119728446007, 0.01318269595503807, -0.010142332874238491, -0.011869532987475395, -0.040446147322654724, 0.06552371382713318, 0.04439055174589157, 0.08176156878471375, -0.06334159523248672, -0.033928077667951584, -0.00024628525716252625, 0.0244551170617342, -0.019419007003307343, -0.09592454880475998, 0.005961012560874224, 0.03278326243162155, -0.07028506696224213, -0.08484592288732529, -6.329250754788518e-05, 0.015018146485090256, -0.05068608745932579, 0.0732998326420784, 0.023434389382600784, 0.0002124009479302913, 0.060401707887649536, 0.013626078143715858, -0.010556582361459732, -0.005069760140031576, -0.004616749472916126, -0.034329116344451904, 0.060584329068660736, -0.05430089309811592, -0.029179023578763008, 0.042385730892419815, -0.0652197003364563, 0.09378205984830856, -0.05090794339776039, -0.008510591462254524, 0.0837036669254303, 0.009071480482816696, 0.04464874789118767, -0.012855015695095062, 0.06306030601263046, -0.08556588739156723, -0.05393703281879425, -0.06741822510957718, -0.03717748448252678, 0.017156923189759254, 0.07401604950428009, -0.06629005819559097, -0.04564857482910156, -0.055414989590644836, 0.039407771080732346, -0.04089723527431488, 0.06915309280157089, 0.030190052464604378, 0.027542876079678535, 0.03557966649532318, 0.05191207677125931, -0.03237364813685417, -0.02036256715655327, -0.071859210729599, -0.06704329699277878, 0.0336633175611496, 0.09511569887399673, 0.0048662531189620495, 0.05273270234465599, -0.056247059255838394, 0.06079721450805664, -0.04150049015879631, -0.08104457706212997, -0.10303051024675369, 0.04522428661584854, -0.04379847273230553, -0.019447194412350655, 0.0021319733932614326, -0.010465282015502453, 0.06857019662857056, -0.00443653529509902, -0.08039603382349014, -0.05012141168117523, 0.0875077098608017, -0.03053239732980728, -0.05321606993675232, 0.016501901671290398, -0.0563507042825222, -0.03187479078769684, -0.0015389680629596114, 0.022985411807894707, -0.05008963868021965, 0.028300117701292038, 0.02875804342329502, -0.024458128958940506, 
-0.022238614037632942, -0.049835607409477234], - "VIRT_REG_GR32":[-0.008479167707264423, -0.02941126376390457, 0.05343153327703476, 0.03769504278898239, -0.0006716987118124962, -0.0329299233853817, 0.03442851081490517, -0.06826753169298172, -0.09117511659860611, -0.018657755106687546, 0.029032904654741287, 0.02404048666357994, 0.010598761960864067, -0.0482308566570282, 0.06956348568201065, -0.027967501431703568, -0.07380961626768112, -0.021098148077726364, -0.0808446854352951, 0.0127912862226367, -0.01355082169175148, -0.040285225957632065, 0.035385165363550186, -0.001157263875938952, -0.026462145149707794, -0.08616211265325546, -0.044482193887233734, -0.010969695635139942, 0.04645564407110214, -0.018178211525082588, -0.038536932319402695, -0.027571648359298706, -0.007523007690906525, -0.02699458785355091, -0.039170436561107635, 0.12889482080936432, -0.04512789845466614, -0.03883056715130806, 0.051210880279541016, 0.03924906626343727, 0.036943964660167694, -0.016879307106137276, 0.011263007298111916, 0.053573690354824066, -0.018964825198054314, -0.041856080293655396, -0.036545924842357635, 0.07715532928705215, -0.041981130838394165, -0.04114629328250885, -0.04393022507429123, -0.030163627117872238, 0.0019487979589030147, 0.10988762229681015, 0.09039165079593658, -0.0035424421075731516, -0.06272851675748825, 0.007701062131673098, -0.01971622183918953, 0.06203003600239754, 0.048561323434114456, -0.04599940404295921, 0.00802221056073904, -0.002905400237068534, -0.1050020381808281, 0.003395768813788891, -0.07973644882440567, 0.008020970039069653, -0.08614815771579742, 0.0518532320857048, 0.021174483001232147, 0.03254232555627823, -0.01905026100575924, -0.0009989180834963918, -0.06409642845392227, -0.022425753995776176, -0.03563409671187401, 0.07717793434858322, -0.04553033784031868, -0.02112392708659172, -0.002374667674303055, 0.03828585892915726, -0.014221777208149433, -0.015974245965480804, -0.01805220916867256, 0.04202109947800636, -0.0841534212231636, 0.06608130037784576, -0.11586519330739975, 0.024179989472031593, 0.017091574147343636, 0.08567194640636444, -0.03692129999399185, 0.03266705200076103, -0.046154942363500595, 0.0040525165386497974, -0.03177625685930252, 0.039895471185445786, 0.042960215359926224, -0.05573953315615654], - "VIRT_REG_GR32_ABCD":[0.016604775562882423, -0.0028934956062585115, 0.041060179471969604, -0.025077441707253456, -0.018642406910657883, 0.023762650787830353, -0.028646549209952354, -0.02460283786058426, 0.005985732190310955, 0.01774146780371666, -0.004014404024928808, -0.05473850294947624, -0.0417158380150795, -0.06322457641363144, 0.060795728117227554, -0.036435071378946304, -0.04245952516794205, 0.08069344609975815, 0.035319335758686066, -0.012020719237625599, 0.045771341770887375, -0.10842540860176086, 0.046253710985183716, -0.004099135287106037, 0.030616935342550278, -0.08288344740867615, 0.08569363504648209, -0.014164377935230732, -0.004303323570638895, 0.09726760536432266, 0.06208871304988861, -0.04007713496685028, 0.005815347656607628, 0.02377200312912464, 0.07813961058855057, 0.03192306309938431, -0.006230524741113186, 0.10110925883054733, -0.023409254848957062, 0.030774405226111412, -0.011607645079493523, -0.03929119184613228, 0.004817614797502756, -0.013827506452798843, 0.07770339399576187, -0.07994075864553452, -0.03157062083482742, 0.06743781268596649, 0.014881699346005917, -0.030165214091539383, -0.07844353467226028, -0.04563238099217415, 0.09747181832790375, 0.057128582149744034, 0.04173563793301582, -0.0011194447288289666, -0.01902887038886547, 
-0.032171595841646194, 0.04824799671769142, 0.008433254435658455, 0.024706291034817696, 0.0746094286441803, 0.04515853151679039, -0.0018984260968863964, -0.10070884972810745, -0.01883143000304699, -0.07785795629024506, 0.10938235372304916, -0.08001448959112167, -0.07419873028993607, 0.010544849559664726, 0.025767439976334572, -0.1005895584821701, 0.05103800818324089, -0.03675306960940361, -0.020510872825980186, 0.022482097148895264, 0.06463642418384552, -0.03149804100394249, -0.021647030487656593, 0.04025804623961449, 0.003628256032243371, 0.03532547131180763, -0.08667688816785812, 0.018817460164427757, -0.01690257526934147, -0.10114696621894836, -0.022815177217125893, 0.024386661127209663, 0.10286301374435425, 0.030005114153027534, 0.0370776504278183, -0.008584428578615189, -0.077603779733181, -0.03588058054447174, 0.030617419630289078, -0.07383710891008377, 0.03215676173567772, 0.03288266062736511, -0.036702848970890045], - "VIRT_REG_GR32_NOREX":[0.019052108749747276, -0.006784944795072079, -0.05410394072532654, 0.001966317882761359, -0.06686867773532867, 0.013514372520148754, 0.030097918584942818, -0.03868359327316284, 0.004314934369176626, -0.06713679432868958, 0.02491898462176323, 0.027683967724442482, 0.035907283425331116, -0.023093875497579575, -0.0892200842499733, -0.1052003800868988, -0.03923499956727028, 0.08808581531047821, -0.10092058777809143, 0.03336786851286888, -0.08974049985408783, -0.015254802070558071, 0.039686985313892365, -0.010083628818392754, -0.03423550724983215, -0.08821681141853333, -0.05621311068534851, -0.020327769219875336, -0.016793876886367798, 0.08908043801784515, -0.04112761467695236, -0.050139520317316055, -0.01524045504629612, 0.05841142684221268, 0.08270087838172913, 0.0348736047744751, -0.016146546229720116, 0.05751227214932442, 0.05081859603524208, -0.07304663956165314, -0.047101784497499466, -0.02825125865638256, 0.0006340605323202908, 0.0008785317186266184, -0.044239338487386703, 0.007173972204327583, -0.029449066147208214, 0.07254412025213242, -0.026029080152511597, 0.025982191786170006, -0.09524690359830856, -0.052613094449043274, -0.1270490437746048, 0.05319184809923172, 0.1046818196773529, 0.0477570965886116, -0.06291303783655167, 0.04725426062941551, -0.05330964922904968, 0.04056742787361145, 0.01543382927775383, 0.03627128154039383, -0.048232536762952805, 0.014761016704142094, -0.007380587514489889, -0.008060632273554802, -0.021923277527093887, -0.022500980645418167, -0.08495079725980759, 0.045358967036008835, -0.04728720709681511, 0.03550735488533974, 0.03445536270737648, -0.01891610585153103, -0.09439470618963242, -0.044266197830438614, -0.07952893525362015, 0.05221104994416237, -0.03507477045059204, 0.04218391329050064, 0.040326621383428574, -0.0395088866353035, 0.02447870559990406, -0.04280063137412071, 0.06520935893058777, -0.003358252113685012, -0.057561881840229034, 0.01911463774740696, 0.05295571684837341, 0.030342884361743927, 0.03814920783042908, -0.03366788476705551, 0.03090745024383068, 0.09487249702215195, -0.002995486371219158, -0.012020634487271309, -0.029147809371352196, 0.09558248519897461, 0.02548893168568611, 0.0931544378399849], - "VIRT_REG_GR64":[0.02717440389096737, -0.026730243116617203, -0.023244258016347885, 0.04027782380580902, 0.006808254402130842, -0.027519788593053818, -0.01906559243798256, 0.027793627232313156, -0.00129543652292341, -0.03455121070146561, 0.021734628826379776, 0.035481199622154236, -0.07251942157745361, -0.025691546499729156, -0.03271827474236488, -0.13225725293159485, -0.0601421520113945, 
0.09084498882293701, -0.10225717723369598, 0.004034099169075489, 0.023578351363539696, -0.041603971272706985, 0.04199974611401558, -0.014711204916238785, -0.04272732138633728, -0.12534455955028534, -0.023738788440823555, 0.005328727886080742, 0.038416482508182526, -0.026419155299663544, -0.041119154542684555, 0.00022502713545691222, -0.05204978585243225, -0.019709734246134758, -0.04102563485503197, 0.06480151414871216, 0.009224721230566502, 0.04627599939703941, 0.027821402996778488, -0.05595114827156067, 0.04526345059275627, 0.024196594953536987, 0.10446277260780334, 0.07561361789703369, -0.08028160035610199, -0.0314163975417614, 0.11944323033094406, 0.1025814488530159, -0.08457476645708084, 0.02227119728922844, -0.041679076850414276, -0.02260834351181984, 0.036674268543720245, 0.10488750785589218, 0.019218411296606064, -0.015966340899467468, -0.06852715462446213, 0.026523491367697716, -0.11090730130672455, -0.0021082640159875154, -0.048291631042957306, -0.032388005405664444, 0.015713853761553764, 0.03355225548148155, -0.06502845883369446, -0.010098783299326897, -0.09930021315813065, -0.017413528636097908, -0.055861033499240875, 0.0801810696721077, -0.03900628536939621, -0.03278445452451706, -0.0337282195687294, -0.11434067040681839, -0.04371264949440956, -0.01736009307205677, -0.05100121721625328, 0.07490750402212143, -0.014680330641567707, -0.02126181870698929, 0.018013890832662582, 0.0018135658465325832, 0.029781077057123184, -0.012477489188313484, -0.021443217992782593, 0.047576501965522766, -0.05993758141994476, -0.06040889024734497, 0.016642581671476364, 0.011624492704868317, -0.042229063808918, -0.007573941722512245, -0.04010608047246933, -0.006444427650421858, -0.014495199546217918, -0.04122597724199295, -0.08505907654762268, -0.004049300216138363, 0.06545045226812363, -0.04762336611747742], - "VIRT_REG_GR64_ABCD":[0.04577033221721649, -0.07758746296167374, 0.00799313560128212, -0.11011485010385513, -0.010862522758543491, 0.012709266506135464, 0.05257265642285347, -0.07354705780744553, 0.04262387007474899, 0.07554348558187485, -0.06358839571475983, 0.006669520866125822, 0.049098193645477295, 0.11183933168649673, -0.028112098574638367, 0.021986473351716995, -0.02839403599500656, -0.06199958547949791, 0.08614487200975418, -0.041216861456632614, 0.041238460689783096, 0.005937385838478804, 0.00200703926384449, -0.05337367579340935, 0.037919919937849045, -0.07485998421907425, -0.09153831005096436, -0.0554175041615963, -0.10251995176076889, -0.01289951242506504, -0.030631467700004578, 0.04197017475962639, -0.03578301519155502, 0.010593005456030369, -0.05836241692304611, 0.06809061765670776, 0.10871735960245132, -0.09833388775587082, -0.009873395785689354, -0.056898634880781174, 0.05946199968457222, 0.015534073114395142, 0.01677171140909195, -0.020233800634741783, -0.006396631710231304, -0.049332089722156525, 0.012649210169911385, 0.03756912052631378, 0.0033660116605460644, -0.09084216505289078, -0.07142844051122665, -0.0030346515122801065, 0.0019640070386230946, 0.038837920874357224, 0.011760945431888103, 0.04995080456137657, -0.06997165083885193, -0.035297296941280365, 0.01996617764234543, 0.01954355463385582, -0.0934600979089737, 0.030165065079927444, -0.007337240036576986, -0.05346155911684036, 0.0732186883687973, -0.04716489836573601, -0.06555212289094925, -0.018465254455804825, 0.051119767129421234, -0.03106619231402874, 0.0748852789402008, -0.02095886692404747, 0.006320921704173088, 0.03146332502365112, -0.08238139003515244, -0.03618254140019417, -0.014570276252925396, 
0.062481846660375595, -0.0394093319773674, -0.05171547457575798, -0.044726233929395676, -0.01228095218539238, 0.09699232876300812, 0.07471026480197906, 0.03112417459487915, 0.022543631494045258, -0.08634103089570999, 0.059702761471271515, -0.013801504857838154, 0.004984616301953793, 0.045798566192388535, -0.03205988556146622, -0.06150995194911957, -0.02244667150080204, 0.03318532556295395, 0.03462471440434456, 0.03236381709575653, 0.0884014293551445, -0.01604369841516018, -0.05234146490693092], - "VIRT_REG_GR64_NOREX":[-0.03959479182958603, -0.06190898269414902, -0.02920372597873211, -0.09973344951868057, -0.004333901684731245, -0.08522991091012955, 0.0459987074136734, -0.057674553245306015, 0.037046968936920166, -0.05669403821229935, -0.02221340872347355, -0.062426190823316574, 0.05804889276623726, -0.02635439857840538, -0.045627325773239136, 0.03632078319787979, 0.07128578424453735, 0.07544906437397003, -0.0537678524851799, -0.04624016210436821, 0.014316501095890999, 0.05580946058034897, 0.05251356214284897, -0.08244197070598602, -0.08901460468769073, -0.07641059905290604, -0.04924754425883293, 0.05417120084166527, -0.0060508353635668755, -0.00814742036163807, -0.06154030188918114, 0.05966867506504059, -0.03231468051671982, 0.021429890766739845, 0.031103987246751785, 0.04343251883983612, -0.08997714519500732, 0.039365898817777634, 0.052908625453710556, -0.02683917060494423, -0.05547752603888512, -0.014131218194961548, 0.0016863569617271423, -0.041112788021564484, -0.010230163112282753, -0.06687774509191513, -0.006144971586763859, -0.08074352145195007, 0.04034091532230377, -0.08176303654909134, -0.004055786412209272, -0.0024839320685714483, -0.007289807312190533, 0.06915127485990524, 0.023709064349532127, 0.04671626538038254, 0.06229325756430626, 0.04707597941160202, 0.06800796836614609, -0.02885584905743599, 0.030613983049988747, -0.019083039835095406, 0.045457858592271805, 0.040770504623651505, -0.05441175401210785, -0.05712401866912842, 0.07744520157575607, -0.0756613239645958, -0.06890957802534103, -0.07997069507837296, 0.09348486363887787, -0.04511028528213501, 0.036194607615470886, 0.040017660707235336, 0.016245214268565178, 0.023104460909962654, 0.058383163064718246, 0.0679842159152031, -0.00921112485229969, -0.10036550462245941, 0.09075804799795151, -0.059704095125198364, -0.013338442891836166, -0.005139742512255907, 0.07807526737451553, 0.06255412846803665, -0.008151572197675705, -0.0624256506562233, 0.012590888887643814, 0.03665084019303322, -0.028498578816652298, -0.01614067517220974, 0.007552243769168854, -0.007216903381049633, 0.0760180801153183, -0.04200543463230133, 0.06412865966558456, -0.05136435106396675, -0.0024792966432869434, 0.06856651604175568], - "VIRT_REG_GR64_NOREX_NOSP":[-0.0656895712018013, 0.058077458292245865, -0.006653467658907175, 0.037784356623888016, 0.07274001836776733, 0.07232078164815903, 0.07074914127588272, 0.05637859180569649, 0.04296007752418518, 0.05499762296676636, -0.01783664897084236, -0.08387365937232971, -0.01376343984156847, -0.07938199490308762, -0.027822256088256836, -0.0663403570652008, 0.036170270293951035, -0.07460261881351471, 0.08652043342590332, 0.02483147382736206, -0.07939319312572479, 0.033202506601810455, 0.0903514102101326, -0.10181311517953873, 0.060751549899578094, 0.07619930803775787, 0.05017509311437607, -0.0470910519361496, 0.07713821530342102, -0.0426195003092289, -0.04506472498178482, 0.003363420255482197, -0.0017315347213298082, 0.06264199316501617, 0.005245774984359741, -0.027923958376049995, 0.09868567436933517, 
0.06738796830177307, -0.10339145362377167, 0.0020383980590850115, 0.087734155356884, 0.011040030047297478, -0.05993311479687691, -0.05790332704782486, 0.01574312523007393, 0.009771298617124557, 0.022676382213830948, -0.009197148494422436, 0.03372732177376747, 0.08404259383678436, -0.015135225839912891, -0.04693703353404999, 0.09917140752077103, 0.007134507410228252, 0.020209072157740593, -0.00027669535484164953, -0.0351635180413723, 0.03751315921545029, -0.019665181636810303, 0.028500953689217567, 0.034186746925115585, -0.005931361112743616, 0.05645192414522171, -0.02027188241481781, -0.022675039246678352, -0.08812297880649567, -0.014896178618073463, -0.048788342624902725, 0.008708382956683636, 0.019917558878660202, -0.002275944221764803, 0.03409638628363609, 0.033304013311862946, 0.057676300406455994, 0.039842985570430756, -0.025169866159558296, 0.016520975157618523, -0.030201178044080734, -0.021718870848417282, -0.07023277878761292, -0.007528252899646759, 0.009067370556294918, -0.0460657961666584, 0.07117785513401031, -0.03609836474061012, -0.011893372051417828, -0.006047600414603949, 0.0179970171302557, 0.024480223655700684, -0.03918423503637314, 0.004897980485111475, 0.05040167644619942, 0.010113563388586044, -0.1074901670217514, -0.06277655810117722, -0.02934161201119423, -0.06922926008701324, -0.05638887360692024, 0.05314395949244499, 0.04588884115219116], - "VIRT_REG_GR64_NOSP":[0.0015277941711246967, -0.03938478231430054, -0.030811766162514687, 0.027071669697761536, 0.02127140760421753, 0.0015787228476256132, -0.07842491567134857, 0.004658385645598173, -0.05909501388669014, -0.03576778993010521, -0.07251477241516113, 0.12117832154035568, 0.04499363154172897, -0.009405314922332764, -0.01015283353626728, -0.002841090550646186, 0.0689091831445694, 0.10697457194328308, -0.09274765104055405, -0.027955353260040283, -0.0379958301782608, -0.044126156717538834, 0.04907212778925896, -0.038063473999500275, -0.003686746582388878, -0.08313410729169846, -0.045181579887866974, -0.011702840216457844, -0.006579228211194277, 0.046807315200567245, -0.045654296875, -0.03466613590717316, -0.08313826471567154, -0.06678880006074905, -0.027727074921131134, 0.036734677851200104, -0.040936414152383804, 0.05170389637351036, 0.038199927657842636, 0.02960256300866604, 0.0355701707303524, -0.02052776888012886, 0.06218089163303375, 0.10570456087589264, -0.036479029804468155, -0.008999336510896683, -0.031860992312431335, 0.07250168174505234, -0.061084795743227005, -0.057996805757284164, -0.010533110238611698, -0.018169214949011803, 0.017261315137147903, 0.10023517906665802, -0.044131457805633545, -0.07618662714958191, -0.09124933928251266, 0.01819406822323799, -0.05906827375292778, 0.04295642301440239, -0.03197735920548439, 0.03641442954540253, 0.005168464966118336, -0.00010972691961796954, -0.0829579159617424, -0.014677388593554497, -0.08750011026859283, -0.04695136100053787, -0.07696729153394699, -0.00718996487557888, 0.018294518813490868, -0.014321570284664631, -0.04416860267519951, -0.0890057235956192, -0.014466283842921257, 0.02831638976931572, -0.04845190420746803, 0.08228176832199097, 0.03420877829194069, 0.056510377675294876, 0.037403274327516556, 0.04364967346191406, 0.08903267979621887, -0.016827082261443138, -0.0682789757847786, 0.06286796927452087, -0.0958203598856926, 0.018489282578229904, 0.02886355295777321, 0.028006011620163918, 0.039986785501241684, -0.04771937429904938, -0.004648604430258274, 0.033939141780138016, -0.027820419520139694, -0.026187442243099213, -0.07972361892461777, 
0.006323353853076696, 0.016448041424155235, -0.01961681991815567], - "VIRT_REG_GR64_NOSP_and_GR64_TC":[0.08079065382480621, -0.05147358775138855, -0.08338657021522522, 0.06757336109876633, -0.015237463638186455, 0.026806311681866646, 0.07564966380596161, -0.037159934639930725, -0.02222878858447075, -0.04553138092160225, -0.006632891017943621, 0.001604291144758463, 0.043711669743061066, 0.0710049569606781, -0.08854726701974869, -0.03142566233873367, -0.0865127220749855, 0.08521236479282379, 0.039203498512506485, 0.04737624153494835, 0.02893459051847458, 0.004120660945773125, 0.03552098199725151, -0.0010448878165334463, 0.04423774778842926, 0.03258584439754486, 0.03433830663561821, -0.019990455359220505, -0.03263172507286072, 0.09782663732767105, -0.00702365068718791, -0.06544602662324905, 0.013447105884552002, 0.04603038728237152, 0.029931804165244102, 0.0988783910870552, -0.062023941427469254, -0.0070026409812271595, 0.032557111233472824, -0.08212000876665115, 0.03199682757258415, 0.020828546956181526, 0.07071725279092789, -0.018812179565429688, -0.0184739138931036, -0.06008931249380112, 0.01504000648856163, -0.019235603511333466, 0.014653048478066921, -0.009083813987672329, 0.03171474114060402, 0.019499456509947777, 0.05263463407754898, 0.10554639250040054, -0.02759619802236557, -0.00156494346447289, -0.03898271545767784, 0.06027846410870552, -0.061001915484666824, 0.039365388453006744, -0.06546281278133392, 0.0006352368509396911, 0.0500405877828598, -0.03232716768980026, -0.010176514275372028, 0.002549059921875596, 0.0666508674621582, -0.037290267646312714, -0.028836704790592194, 0.06271649152040482, -0.016647985205054283, 0.013602355495095253, 0.020110899582505226, 0.011730309575796127, -0.10071564465761185, -0.06239647418260574, -0.09507977962493896, -0.09190725535154343, -0.08861985802650452, -0.0006123466300778091, 0.0951915979385376, -0.035364676266908646, -0.04007220268249512, 0.08415472507476807, 0.0006664254469797015, 0.05864431709051132, 0.01460045762360096, -0.09507087618112564, 0.024228032678365707, 0.04208158329129219, 0.006106846500188112, 0.09294755011796951, 0.06157369166612625, 0.0826527327299118, -0.058974966406822205, -0.09958664327859879, 0.06913749873638153, -0.08108915388584137, 0.07425157725811005, 0.04784728214144707], - "VIRT_REG_GR64_TC":[-0.0944172665476799, 0.040403831750154495, -0.017597073689103127, 0.04766053333878517, -0.03104357235133648, 0.025751160457730293, 0.036779265850782394, -0.0235747080296278, 0.032111138105392456, 0.009872193448245525, -0.01596468687057495, 0.05234881862998009, -0.047335200011730194, 0.005157034378498793, -0.02132921665906906, -0.0544377863407135, 0.057515472173690796, -0.006743279751390219, -0.01474941335618496, -0.0990658849477768, 0.022418741136789322, -0.007098495960235596, 0.046933863312006, 0.1002131924033165, 0.01583809033036232, 0.03995800018310547, -0.017743254080414772, -0.01684877835214138, 0.06543229520320892, 0.04597911611199379, 0.05365373566746712, -0.008774830959737301, -0.01341968309134245, -0.004754040390253067, 0.04739849269390106, 0.032378777861595154, -0.0020728895906358957, 0.03502136841416359, 0.05946416035294533, -0.06190952658653259, 0.01910495012998581, -0.023678753525018692, 0.012653682380914688, -0.06766874343156815, -0.0729866623878479, 0.0757005363702774, -0.027033904567360878, -0.06776778399944305, -0.010131776332855225, -0.06334701925516129, -0.04702980816364288, 0.06837917864322662, 0.002726735547184944, 0.04345812648534775, 0.04288078844547272, -0.06921732425689697, -0.07625382393598557, 
0.037991974502801895, -0.04257906600832939, 0.06338586658239365, 0.05315309390425682, -0.02785014547407627, 0.04054750129580498, 0.06967299431562424, -0.07271680235862732, 0.0032969408202916384, -0.08254148811101913, 0.07269596308469772, -0.01827111467719078, 0.034775473177433014, 0.010106234811246395, 0.0389409065246582, 0.042805008590221405, -0.03822058066725731, 0.0668339803814888, -0.005216705612838268, -0.00022202919353730977, -0.0221820380538702, -0.027401722967624664, -0.045061662793159485, -0.05296671763062477, -0.0190189890563488, -0.002744461875408888, -0.04073096439242363, -0.06974441558122635, 0.05868958309292793, -0.06907399743795395, -0.026619713753461838, 0.015318086370825768, 0.035948701202869415, -0.08301021158695221, 0.03955607861280441, 0.028369972482323647, 0.0202812347561121, -0.12075140327215195, -0.039504438638687134, -0.03826067969202995, 0.01607581228017807, 0.02135113812983036, -0.08897850662469864], - "VIRT_REG_GR64_TC_with_sub_8bit":[0.00805664248764515, 0.06228634715080261, -0.005148644559085369, -0.025605352595448494, -0.04853198677301407, -0.018169978633522987, 0.008530518971383572, -0.1050964742898941, -0.08428415656089783, -0.014802628196775913, 0.05918573588132858, 0.07529161125421524, 0.09815273433923721, -0.014188972301781178, 0.06676790118217468, 0.09496084600687027, -0.03843621164560318, -0.00740150036290288, -0.11988909542560577, -0.01781499572098255, -0.03719411790370941, -0.07447166740894318, 0.005513608455657959, -0.014381160028278828, 0.036786310374736786, -0.04839075356721878, -0.009440913796424866, 0.03984222561120987, -0.08096668124198914, 0.026751000434160233, 0.06400448083877563, 0.07998895645141602, 2.295125523232855e-05, 0.0266779325902462, -0.0030931613873690367, 0.05236855521798134, -0.010479471646249294, -0.011119752191007137, -0.06124376133084297, -0.019449712708592415, 0.03448517248034477, -0.04095051810145378, 0.01377212442457676, 0.09643338620662689, 0.021325431764125824, 0.06029453128576279, 0.048866767436265945, -0.03436344116926193, -0.043422505259513855, 0.03822150453925133, 0.004718889016658068, -0.04090931639075279, -0.04219569265842438, 0.019032739102840424, 0.06111171841621399, 0.04305591061711311, -0.0379939004778862, -0.03224434703588486, -0.06517905741930008, 0.002272483194246888, 0.09273418039083481, -0.028145847842097282, 0.01824336126446724, 0.00936606340110302, -0.07281909137964249, -0.028650810942053795, -0.060721538960933685, -0.09477518498897552, -0.0014060320099815726, 0.06919887661933899, -0.03463669493794441, 0.0026504716370254755, -0.0653621107339859, -0.02800566703081131, -0.02503957599401474, -0.060285311192274094, 0.014794053509831429, -0.08424058556556702, 0.0482206828892231, -0.07467620074748993, -0.09909844398498535, -0.06888734549283981, -0.0014173799427226186, -0.09022543579339981, 0.06461413204669952, 0.024526789784431458, -0.07400602847337723, -0.008816084824502468, 0.025513656437397003, 0.047476526349782944, -0.05981749668717384, 0.08338218182325363, 0.02657591737806797, 0.03547860309481621, -0.043622229248285294, 0.10129662603139877, 0.08802521973848343, -0.09759330749511719, 0.025680232793092728, 0.05964493378996849], - "VIRT_REG_GR64_with_sub_16bit_in_GR16_NOREX":[-0.03117012232542038, -0.02872271090745926, -0.039712607860565186, 0.03738812729716301, 0.030099159106612206, 0.00013636364019475877, -0.019107641652226448, -0.04186702147126198, -0.053099144250154495, -0.020432034507393837, -0.0004185919533483684, 0.010934959165751934, 0.036054231226444244, 0.03788067027926445, 0.05227302014827728, 
-0.034505825489759445, -0.08298061788082123, 0.0399160161614418, 0.03668724000453949, 0.014606554992496967, -0.0071771652437746525, 0.059049926698207855, -0.06330917030572891, 0.007379058748483658, -0.0750177726149559, -0.0423760749399662, -0.019386067986488342, -0.018436923623085022, -0.015116279944777489, 0.023602722212672234, 0.0533282607793808, -0.026401247829198837, 0.023750485852360725, -0.027648568153381348, -0.016443056985735893, 0.04291580244898796, -0.04391908273100853, 0.05113501846790314, -0.03743087872862816, 0.056367188692092896, 0.048130668699741364, -0.0230261143296957, 0.03358393907546997, -0.030188169330358505, 0.08421863615512848, 0.0033821314573287964, 0.03151029348373413, -0.042818162590265274, 0.04007953777909279, -0.0050337472930550575, 0.03335743024945259, -0.026563530787825584, 0.016440672799944878, -0.04272226244211197, -0.07304228097200394, 0.024836458265781403, -0.016342775896191597, -0.055494848638772964, -0.05826134234666824, 0.027478834614157677, 0.025981346145272255, -0.04745938256382942, 0.013695796020328999, -0.027888784185051918, 0.03769542649388313, -0.024486247450113297, 0.04720773920416832, -0.012697651982307434, -0.03559652715921402, 0.012948199175298214, -0.025600459426641464, 0.014954420737922192, -0.06651762872934341, 0.04277091473340988, -0.08291683346033096, 0.016881149262189865, 0.04145864024758339, -0.04162050038576126, -0.03363965451717377, -0.05018439516425133, 0.06321889907121658, -0.00871780700981617, 0.06867428869009018, 0.057975344359874725, 0.009704249911010265, 0.049075234681367874, -0.06111253425478935, 0.027943406254053116, 0.03725599870085716, 0.032480716705322266, -0.01960119605064392, -0.0295172780752182, 0.014026675373315811, 0.056797921657562256, -0.031707022339105606, 0.0010152219329029322, -0.023705823346972466, -0.07695567607879639, 0.017504720017313957, -0.0020094760693609715], - "VIRT_REG_GR64_with_sub_8bit":[-0.011493992060422897, -0.027181852608919144, 0.022013556212186813, 0.05687474459409714, -0.03289574757218361, -0.04803529754281044, -0.04204253479838371, 0.044671084731817245, -0.0849028080701828, -0.09561576694250107, 0.03596775606274605, 0.027156801894307137, 0.05034027621150017, -0.006308000069111586, 0.012393618933856487, -0.048590339720249176, -0.049129705876111984, 0.059305012226104736, -0.10330235958099365, 0.00738809397444129, 0.03855152800679207, -0.03220852091908455, 0.05221837759017944, -0.01274650078266859, 0.024303985759615898, -0.05925533175468445, -0.015623844228684902, -0.025864524766802788, 0.009918035939335823, 0.004779431037604809, -0.02866589091718197, 0.006512579973787069, -0.037251196801662445, 0.005028596147894859, -0.011677909642457962, 0.051886074244976044, -0.03552602231502533, 0.011968757025897503, 0.00829426757991314, -0.06981230527162552, -0.029781555756926537, -0.012621275149285793, 0.08595969527959824, 0.08630531281232834, 0.10018875449895859, -0.054863955825567245, -0.044519901275634766, 0.0893385037779808, 0.04004377871751785, 0.003711731405928731, -0.021447300910949707, -0.08500636368989944, 0.0037281641270965338, 0.14561010897159576, 0.03993009030818939, 0.07621612399816513, 0.020513180643320084, 0.004926605150103569, -0.035578932613134384, 0.06101486086845398, -0.08422145247459412, -0.03511432558298111, 0.01537742093205452, -0.010146304965019226, -0.05133780837059021, -0.010472903028130531, -0.09726933389902115, -0.010570867918431759, -0.09348491579294205, 0.002129049738869071, -0.01265127956867218, 0.03504374623298645, -0.008679943159222603, -0.002507386729121208, 
-0.06586045026779175, -0.04775359109044075, -0.042809367179870605, 0.08359787613153458, -0.0230431966483593, -0.015440763905644417, 0.0195400882512331, -0.0186530202627182, -0.03176320344209671, -0.019522372633218765, -0.02984560839831829, 0.024256182834506035, -0.07656785100698471, 0.03944750130176544, 0.016559945419430733, 0.007124909665435553, 0.08061631768941879, 0.08561833202838898, -0.018525447696447372, -0.0019649232272058725, -0.018469924107193947, -0.012311050668358803, -0.08448101580142975, 0.060216110199689865, 0.06368701905012131, -0.07110093533992767], - "VIRT_REG_GR8":[0.02255251444876194, 0.012649326585233212, 0.05363747105002403, -0.006129346787929535, 0.027027001604437828, 0.03703385218977928, -0.045294541865587234, -0.02489621751010418, 0.026587747037410736, -0.06228360906243324, 0.01547946222126484, 0.03494448587298393, 0.08276952058076859, -0.03470698744058609, 0.0036826131399720907, 0.04216131567955017, -0.04518325626850128, 0.09584730118513107, -0.09126991778612137, -0.11293632537126541, 0.0141398124396801, -0.05086163431406021, 0.0421922467648983, -0.0001364851341350004, 0.05821910500526428, -0.04154132679104805, 0.036521218717098236, -0.016718950122594833, 0.0773339569568634, 0.05134757608175278, -0.03728386387228966, -0.014684299007058144, 0.016949277371168137, 0.025767508894205093, -0.01573120802640915, 0.0343811996281147, 0.008209497667849064, 0.0011038129450753331, -0.06688684970140457, -0.08167136460542679, 0.03875276446342468, 0.08301592618227005, 0.023012684658169746, 0.07135005295276642, 0.008461466059088707, 0.004998552612960339, 0.02622731775045395, -0.09479465335607529, 0.014987453818321228, -0.008574756793677807, -0.008050303906202316, -0.005560623947530985, 0.04616820812225342, 0.11537269502878189, 0.032199542969465256, 0.05507092550396919, -0.053164780139923096, 0.012255114503204823, -0.01981479674577713, 0.06012535095214844, 0.043957680463790894, 0.02384384348988533, 0.04837791621685028, 0.04945961385965347, -0.1063770279288292, -0.07354240119457245, -0.08922741562128067, -0.026019031181931496, -0.08768662065267563, 0.09241457283496857, 0.03253300115466118, -0.018267929553985596, -0.04406850412487984, -0.05577726289629936, -0.05304105579853058, 0.016035545617341995, 0.05610279366374016, 0.06247573718428612, -0.019430609419941902, -0.017088554799556732, -0.022114543244242668, 0.07442588359117508, -0.017668865621089935, -0.02403153106570244, 0.006919574458152056, 0.05879344418644905, -0.0885634645819664, -0.016336753964424133, -0.024662213400006294, 0.029266972094774246, -0.04889025166630745, 0.042460259050130844, -0.013102580793201923, 0.023992935195565224, 0.024768078699707985, 0.047551900148391724, -0.02243787795305252, 0.05929713696241379, 0.03110451251268387, -0.00550821190699935], - "VIRT_REG_RFP80":[-0.04414765536785126, 0.05147779360413551, -0.035608600825071335, -0.03939598798751831, 0.0430026613175869, -0.03331028297543526, 0.015591064468026161, 0.01892651617527008, -0.011428372003138065, -0.06980786472558975, 0.06445881724357605, 0.1036338210105896, 0.01164929661899805, -0.07599718868732452, 0.022036561742424965, 0.10396245121955872, -0.041171155869960785, -0.07264886051416397, 0.00032837275648489594, 0.02848120965063572, -0.031889040023088455, 0.023848745971918106, -0.02298046089708805, -0.05559201166033745, 0.026687605306506157, 0.0565699003636837, -0.0134252505376935, 0.05494402348995209, -0.0584089457988739, 0.05422470346093178, -0.024360226467251778, 0.03570455685257912, 0.013681530021131039, -0.006910417694598436, 0.011886067688465118, 
0.07619262486696243, 0.08147607743740082, 0.05824091285467148, 0.001224246108904481, -0.030463339760899544, -0.023527851328253746, 0.03078501485288143, -0.02225799672305584, -0.058049511164426804, 0.015403151512145996, 0.07900431007146835, 0.025944147258996964, 0.021455328911542892, 0.023985104635357857, -0.0327906534075737, 0.04195002466440201, -0.10313323140144348, -0.023333510383963585, -0.010316243395209312, -0.02042137086391449, 0.07474000751972198, 0.02313513681292534, -0.0030733307357877493, 0.06138097122311592, 0.005197131074965, -0.03222955763339996, 0.005364845506846905, -0.05313501134514809, 0.0013082564109936357, 0.025044983252882957, 0.0349799208343029, 0.09704083949327469, -0.017403649166226387, -0.03375721350312233, 0.05970870703458786, -0.021679691970348358, -0.04719642922282219, 0.024217652156949043, -0.06130526587367058, 0.004813425708562136, 0.07473690062761307, -0.039600174874067307, -0.009295261465013027, 0.05440402403473854, 0.04785943776369095, -0.04006686061620712, -0.020133933052420616, 0.00989031046628952, -0.054447200149297714, 0.06291327625513077, -0.01196430902928114, 0.0841275230050087, -0.05557875707745552, -0.0813804343342781, -0.0746457576751709, -0.024255990982055664, -0.048101916909217834, -0.014132879674434662, -0.013147399760782719, -0.009715595282614231, 0.08717820793390274, -0.04318689927458763, -0.0311901792883873, -0.017253845930099487, 0.005144816357642412], - "VIRT_REG_VR128":[0.08292517066001892, 0.053138989955186844, 0.0019234063802286983, -0.030035940930247307, 0.0821828693151474, -0.0540342852473259, 0.06449387222528458, -0.03985493257641792, 0.026820721104741096, 0.0352952741086483, -0.1056072935461998, 0.054804764688014984, 0.01685425080358982, 0.05867069214582443, 0.11665259301662445, -0.07655566930770874, 0.021201618015766144, 0.00927705504000187, -0.04723019897937775, 0.016582123935222626, -0.01160470675677061, -0.013075411319732666, 0.01054342370480299, -0.05403316020965576, 0.033609066158533096, -0.07971179485321045, 0.1005927175283432, -0.020655132830142975, -0.0036442605778574944, 0.018269486725330353, 0.036334097385406494, -0.06517180055379868, -0.028530113399028778, -0.03768114373087883, 0.10582506656646729, 0.011199450120329857, -0.06707775592803955, 0.02332702837884426, -0.014528930187225342, -0.09369251132011414, 0.069722481071949, 0.031001657247543335, 0.08032777905464172, -0.060744334012269974, 0.015131807886064053, 0.01935953088104725, -0.087028868496418, 0.041773099452257156, 0.0381581112742424, -0.07518653571605682, 0.021307995542883873, -0.07350508868694305, -0.04699733853340149, -0.007377162110060453, 0.07836157828569412, 0.016066696494817734, -0.02160775288939476, -0.030519334599375725, -0.09255059063434601, 0.03597188740968704, -0.11260625720024109, -0.08602424710988998, 0.058293748646974564, -0.034749604761600494, 0.005541469436138868, -0.07924741506576538, -0.024103455245494843, 0.06047135218977928, 0.026729481294751167, 0.03493977710604668, -0.07453227788209915, -0.01716521382331848, 0.008985077030956745, -0.08075122535228729, 0.03353623300790787, -0.08125714957714081, 0.04245763644576073, 0.06520543247461319, 0.020550349727272987, -0.003161275526508689, -0.03491697832942009, -0.005496494937688112, 0.09021904319524765, -0.057418785989284515, 0.03494826331734657, -0.052578359842300415, -0.044952504336833954, 0.11770184338092804, -0.048565153032541275, -0.03815764561295509, 0.06020108237862587, -0.09397949278354645, 0.03820547088980675, 0.08039405196905136, 0.014751153998076916, 0.006572262849658728, 
0.05658692866563797, 0.05043925344944, -0.0060436660423874855, -0.12018798291683197], - "VIRT_REG_VR256":[0.032775089144706726, 0.029240285977721214, 0.01821955479681492, 0.023595772683620453, -0.02587016113102436, -0.12190376222133636, 0.09720813482999802, 0.005780891049653292, -0.0581410676240921, 0.04817686229944229, -0.04627984017133713, 0.03618951886892319, -0.10393846780061722, 0.04380590096116066, 0.030101926997303963, -0.021811308339238167, 0.0012455569813027978, 0.06209835410118103, -0.08859474956989288, 0.0671553835272789, -0.006448917090892792, 0.0169842429459095, 0.031113164499402046, -0.07417412847280502, 0.05549546331167221, -0.013042094185948372, 0.0948401540517807, -0.07335975021123886, -0.03987044095993042, -0.005343804135918617, -0.08741248399019241, -0.08009110391139984, 0.005667346995323896, 0.03745159134268761, 0.019986214116215706, -0.03723142296075821, -0.0037649653386324644, 0.005682446528226137, 0.0659727230668068, -0.002658356446772814, 0.07049102336168289, -0.01944110542535782, -0.014278342947363853, 0.04189611226320267, 0.0312303826212883, -0.046760618686676025, 0.040438465774059296, 0.054074693471193314, 0.07479880005121231, -0.016405146569013596, 0.027125591412186623, -0.04216836765408516, 0.0011189498472958803, -0.01471384521573782, -0.010250975377857685, -0.006412460468709469, -0.12170380353927612, 0.015495882369577885, -0.054699406027793884, 0.05955614894628525, 0.06753991544246674, -0.03688138723373413, 0.049010518938302994, -0.07614680379629135, 0.06504888087511063, -0.014145595952868462, 0.02210555598139763, 0.023598313331604004, 0.00511248828843236, 0.013318972662091255, -0.11605404317378998, -0.032067783176898956, -0.05010659247636795, -0.023693162947893143, 0.06650379300117493, -0.026386691257357597, 0.06052805855870247, 0.0515507273375988, 0.033960308879613876, -0.06421340256929398, -0.09355985373258591, -0.0658700093626976, 0.10278744995594025, -0.10271084308624268, -0.012089421041309834, -0.04169749841094017, -0.07112454622983932, -0.032573599368333817, -0.0003141233173664659, 0.017007946968078613, 0.03622191399335861, 0.05829676240682602, 0.06261610984802246, 0.005667738616466522, 0.009631159715354443, 0.022852277383208275, 0.057013869285583496, -0.05015721917152405, 0.027599012479186058, -0.08637165278196335], "VMASKMOVPDYmr":[-0.04878474771976471, 0.009688055142760277, 0.05428608879446983, -0.030850162729620934, 0.03008297272026539, 0.03831377625465393, -0.023454757407307625, 0.061062078922986984, -0.07177434861660004, 0.003681673901155591, 0.040161218494176865, -0.009652352891862392, 0.07261710613965988, -0.010966332629323006, -0.013221205212175846, -0.03301544487476349, 0.04829031974077225, -0.08083753287792206, 0.030231673270463943, -0.02659734897315502, -0.036777157336473465, 0.06681652367115021, 0.01175805926322937, 0.06305940449237823, -0.019296150654554367, 0.02796877548098564, -0.029999401420354843, -0.0198240764439106, -0.04471949115395546, -0.06781838089227676, 0.024380704388022423, 0.03754236921668053, 0.06767786294221878, 0.04803696274757385, 0.046649131923913956, 0.04538867995142937, -0.028629129752516747, 0.0127564687281847, 0.004995361436158419, -0.08728974312543869, 0.029057662934064865, 0.07067801058292389, 0.0007887053652666509, 0.019237162545323372, -0.04447153955698013, -0.10583364218473434, 0.08983936905860901, 0.015038984827697277, -0.034384895116090775, -0.055098336189985275, -0.07670909911394119, 0.002524072304368019, 0.10086455941200256, 0.022610867395997047, 0.05591642111539841, -0.07907918840646744, 
-0.04253252223134041, 0.05387851223349571, -0.034182146191596985, -0.08478306978940964, -0.039358172565698624, 0.05872701108455658, 0.0004980096709914505, -0.054017916321754456, -0.05543661117553711, -0.05234605073928833, -0.01648441143333912, -0.039598412811756134, 0.014009279198944569, 0.07753992825746536, -0.024791967123746872, 0.0015941763995215297, -0.08564147353172302, 0.015439499169588089, 0.04659571126103401, 0.042471837252378464, 0.005456998012959957, 0.015990061685442924, -0.02272135764360428, -0.03891618177294731, -0.0077924951910972595, -0.05113787576556206, 0.040118955075740814, -0.043831776827573776, 0.05283576622605324, 0.09104584157466888, 0.015506122261285782, -0.028880758211016655, -0.0025508899707347155, 0.08238258212804794, -0.011219828389585018, 0.0496247261762619, -0.044287387281656265, 0.050674524158239365, 0.02936738170683384, -0.017218898981809616, 0.07722929865121841, 0.04578819498419762, -0.031120644882321358, -0.022032534703612328], "VMASKMOVPSYmr":[0.020578626543283463, -0.004085692577064037, 0.07696651667356491, 0.028803450986742973, -0.006955036427825689, -0.018540993332862854, 0.0719260424375534, 0.09322775900363922, 0.05095001682639122, -0.01811334490776062, 0.01627892442047596, 0.050088733434677124, -0.06736274808645248, 0.025077303871512413, 0.06022811681032181, -0.09305489808320999, -0.09338469058275223, -0.0525103323161602, -0.06159364432096481, 0.030921749770641327, 0.06632588058710098, 0.031169326975941658, 0.016549210995435715, -0.06410345435142517, 0.034944821149110794, 0.01632581278681755, 0.06805131584405899, -0.004622941836714745, -0.02994105964899063, 0.025459013879299164, 0.020487098023295403, 0.06677251309156418, -0.046148937195539474, -0.05847230181097984, -0.0662517175078392, -0.006552667822688818, 0.05338975414633751, -0.07456435263156891, -0.05682503432035446, -0.0720917284488678, -0.08354304730892181, 0.057539310306310654, -0.0984572172164917, -0.015717046335339546, -0.04905203357338905, -0.016580646857619286, 0.030063051730394363, -0.04245767742395401, -0.019089849665760994, 0.037014883011579514, -0.03125334531068802, -0.02194075658917427, 0.057924628257751465, 0.053156934678554535, 0.03154401481151581, -0.03698640316724777, -0.047283731400966644, -0.07787752151489258, -0.09294760227203369, 0.008879968896508217, -0.039479922503232956, 0.06407082825899124, 0.021868228912353516, 0.02621234394609928, -0.05872492864727974, -0.07943505048751831, 0.024682780727744102, 0.014713538810610771, 0.02206231839954853, -0.0664556622505188, -0.08985312283039093, -0.028045928105711937, 0.022865260019898415, -0.03564520925283432, 0.06292934715747833, 0.009946631267666817, 0.031550049781799316, -0.08577742427587509, 0.047102898359298706, -0.07018786668777466, -0.10670997202396393, 0.0016501775244250894, -0.08505392074584961, 0.00861909706145525, -0.06370823830366135, 0.03423767164349556, 0.03173772618174553, 0.019602738320827484, 0.021573755890130997, 0.02385428547859192, -0.01468846295028925, 0.023825718089938164, -0.05538937821984291, 0.05968264490365982, 0.08997872471809387, -0.006320557557046413, 0.012793052941560745, -0.10326020419597626, -0.015349009074270725, 0.006139614153653383], "VMASKMOVPSYrm":[-0.09999474138021469, -0.05461611971259117, 0.06111544370651245, 0.009340068325400352, 0.05158305540680885, 0.018409717828035355, 0.03258055821061134, -0.0017857305938377976, 0.041260261088609695, -0.04183795303106308, -0.04711655154824257, 0.007005605846643448, 0.017177876085042953, 0.011972760781645775, -0.058734532445669174, 0.022736912593245506, 
-0.10794606059789658, 0.029367392882704735, -0.012645614333450794, -0.09590506553649902, -0.07090207934379578, -0.05850019305944443, -0.018024247139692307, -0.0036007456947118044, -0.06459654122591019, 0.009839186444878578, 0.04846305027604103, -0.11106285452842712, 0.029033005237579346, 0.10009876638650894, 0.012796668335795403, -0.0073439814150333405, -0.08754748106002808, -0.037603843957185745, -0.015900349244475365, -0.007158457301557064, 0.03420218825340271, 0.027995899319648743, -0.07699259370565414, 0.042778756469488144, 0.04648644104599953, -0.04391217231750488, -0.018405593931674957, -0.01280362717807293, 0.08530068397521973, -0.03674551844596863, 0.06248623505234718, 0.0038591010961681604, -0.07031620293855667, -0.01702764257788658, 0.005379523150622845, -0.029414091259241104, 0.00011999297566944733, 0.058016858994960785, 0.10091454535722733, 0.07112561911344528, -0.07445680350065231, -0.08252609521150589, -0.05458306148648262, 0.0828995481133461, 0.030287114903330803, 0.08512170612812042, -0.0745752677321434, 0.011145705357193947, 0.07730960845947266, 0.06756677478551865, 0.10192125290632248, -0.015338120050728321, 0.025173967704176903, -0.017697714269161224, 0.00455897580832243, -0.01002852339297533, -0.09001599997282028, 0.06024448946118355, 0.01357717253267765, -0.04349803552031517, 0.026919689029455185, 0.07871785014867783, 0.06163106486201286, -0.02904645912349224, 0.05042176693677902, 0.019180594012141228, -0.029065869748592377, -0.02645217627286911, -0.04180121049284935, -0.01644887775182724, 0.005278781522065401, 0.021325504407286644, 0.0710480809211731, -0.02405066229403019, 0.06883849203586578, -0.08493685722351074, -0.0180019810795784, 0.10276532918214798, -0.04697193205356598, -0.0004998040967620909, 0.014400942251086235, 0.07172509282827377, 0.027445673942565918, 0.04722077399492264], @@ -673,5 +642,42 @@ "V_SETALLONES":[0.011805560439825058, 0.005605545360594988, 0.019577916711568832, -0.007038246374577284, -0.013101942837238312, -0.060087915509939194, 0.06600171327590942, 0.1127510741353035, 0.03251935541629791, -0.08513955771923065, -0.1272188425064087, -0.05743984133005142, 0.03415455296635628, -0.01813715696334839, 0.08123213797807693, -0.02604430541396141, 0.004977638833224773, -0.05056260898709297, 0.0759609192609787, -0.03905864432454109, -0.029284782707691193, -0.0773778036236763, -0.06391929090023041, 0.03013690747320652, 0.025567403063178062, -0.04096659645438194, -0.013911372050642967, 0.03076753579080105, 0.09287972748279572, 0.06516721844673157, 0.013303481042385101, -0.05148301273584366, 0.013247961178421974, -0.02087739109992981, -0.06532798707485199, -0.07080436497926712, 0.03797996789216995, -0.05954182893037796, -0.006158157251775265, -0.039611611515283585, 0.016250262036919594, -0.009441757574677467, -0.009183786809444427, 0.16159473359584808, 0.08712765574455261, -0.022884182631969452, -0.03575573116540909, -0.03199240192770958, -0.03306444734334946, -0.003918874077498913, 0.062194518744945526, 0.015179269947111607, -0.027334710583090782, -0.058873455971479416, 0.128275528550148, -0.0292880367487669, -0.07747887820005417, 0.1131230816245079, 0.02434738725423813, -0.025987306609749794, 0.006977062206715345, 0.005061171483248472, 0.010551988147199154, -0.011694980785250664, -0.04222672060132027, 0.0018857514951378107, -0.09771532565355301, 0.005980918649584055, -0.021874738857150078, -0.03269551321864128, -0.0660959854722023, -0.03511122986674309, -0.012204808183014393, -0.010394910350441933, 0.05620425567030907, -0.07928325980901718, 
0.0231300238519907, -0.018796175718307495, -0.059483520686626434, -0.06498315185308456, -0.002720780670642853, 0.017449399456381798, -0.07902888208627701, -0.09885134547948837, 0.013462111353874207, 0.0991656631231308, 0.03312922269105911, -0.006249894388020039, 0.005173753947019577, -0.06332565099000931, -0.06398826092481613, -0.03855561092495918, 0.049685269594192505, 0.016197331249713898, -0.006844596937298775, -0.05894636735320091, 0.026065604761242867, -0.023921040818095207, 0.0833858922123909, 0.04180749133229256], "XCHG":[0.03013892099261284, -0.005918541457504034, -0.003877029987052083, -0.01153622567653656, 0.07044235616922379, 0.0020885420963168144, -0.04268760234117508, 0.07963797450065613, 0.0896378755569458, -0.03346250206232071, -0.026062551885843277, 0.07721738517284393, 0.08893758058547974, 0.0798523873090744, -0.025333784520626068, -0.01930663175880909, -0.012997916899621487, -0.051225848495960236, -0.0299966000020504, -0.032841041684150696, -0.06343690305948257, -0.016547048464417458, 0.034530773758888245, 0.057199425995349884, 0.0693645030260086, 0.04208416864275932, -0.028830133378505707, 0.08431533724069595, -0.06464798003435135, 0.0009512414690107107, 0.042868468910455704, -0.031348757445812225, -0.01816270686686039, 0.05597987025976181, -0.017707090824842453, -0.03889893740415573, -0.052769940346479416, 0.012921033427119255, -0.029488561674952507, -0.012502696365118027, 0.05398940294981003, -0.032147347927093506, -0.005250571761280298, -0.014250441454350948, 0.08205590397119522, 0.049281857907772064, -0.07257362455129623, -0.0003973407146986574, -0.00821124017238617, 0.10007432103157043, 0.054469816386699677, -0.05644146353006363, 0.013105852529406548, -0.08262810856103897, -0.02594495750963688, 0.007682343479245901, -0.011262120679020882, -0.007376475725322962, -0.05011703073978424, -0.06952987611293793, -0.033738043159246445, 0.01750120520591736, -0.026767224073410034, -0.04718783125281334, 0.002559647196903825, 0.01700885407626629, -0.07193762063980103, 0.07015261799097061, 0.0034866048954427242, -0.08257746696472168, -0.07703307271003723, 0.006709580775350332, 0.06423933804035187, 0.024792056530714035, -0.008637255057692528, 0.0364011712372303, 0.035330090671777725, -0.060980167239904404, 0.026977067813277245, -0.02813805267214775, -0.02690977416932583, 0.05637027323246002, 0.008040377870202065, -0.03371180593967438, -0.06654872000217438, -0.030922764912247658, -0.07050447911024094, 0.047597192227840424, 0.047301240265369415, 0.04565070942044258, -0.0005885852151550353, -0.01970672234892845, -0.013277091085910797, 0.03462797775864601, -0.050644565373659134, -6.830461643403396e-05, -0.0032834408339112997, -0.09096988290548325, -0.0431605726480484, 0.004180085379630327], "XOR":[0.05397406592965126, 0.030059566721320152, -0.008174624294042587, -0.015902524814009666, -0.05867229402065277, 0.10023067146539688, 0.039013586938381195, -0.0062194764614105225, 0.0027951474767178297, -0.12871405482292175, 0.006182669661939144, -0.03362947702407837, 0.03972288593649864, -0.0761077031493187, 0.07198456674814224, 0.06330277770757675, -0.020690103992819786, 0.04084693267941475, -0.029953323304653168, -0.1037738174200058, 0.058683767914772034, -0.09326515346765518, -0.030509043484926224, 0.08620086312294006, -0.028335779905319214, 0.0025649559684097767, 0.02293877862393856, 0.06309233605861664, 0.05537085980176926, 0.008650199510157108, 0.08450134843587875, 0.006163342390209436, 0.08676894754171371, 0.00373055599629879, -0.0536164715886116, 0.017478466033935547, 
-0.02005663886666298, -0.009954672306776047, 0.0935724526643753, -0.013202485628426075, 0.019175032153725624, 0.047811202704906464, -0.010279017500579357, 0.08613553643226624, 0.030951783061027527, -0.007498149760067463, 0.02222890406847, 0.022576699033379555, -0.037464242428541183, -0.05039561539888382, -0.05145428702235222, 0.05291113257408142, -0.04549814388155937, 0.07552238553762436, 0.04320567473769188, 0.08343681693077087, -0.03850278630852699, -0.01834949105978012, 0.047886237502098083, 0.00965320598334074, 0.014898041263222694, -0.06947735697031021, -0.002480468712747097, 0.033667247742414474, -0.057668499648571014, 0.038462892174720764, -0.04644528403878212, -0.06664751470088959, -0.048734813928604126, 0.04303475841879845, 0.027636554092168808, 0.024116700515151024, -0.003788548056036234, -0.0088395019993186, -0.04236738011240959, -0.02894027903676033, -0.135579451918602, -0.032144784927368164, -0.11316774785518646, -0.0039872839115560055, 0.07162772864103317, 0.03945969045162201, 0.007661669049412012, 0.04564569517970085, 0.023007070645689964, 0.0002026051515713334, -0.030437719076871872, -0.01982058770954609, -0.017619898542761803, -0.04013601690530777, 0.03464880958199501, -0.04437020793557167, 0.010373799130320549, -0.057255037128925323, -0.006371108815073967, -0.02713695913553238, -0.06605585664510727, 0.01780680939555168, -0.00013575045159086585, 0.07283638417720795] + }, + "CommonOperands" : { + "Immediate":[-0.039664868265390396, 0.028720445930957794, -0.057207897305488586, 0.04179477319121361, 0.04477043077349663, 0.020050648599863052, -0.056656818836927414, -0.025030966848134995, -0.04394019395112991, 0.04849115386605263, 0.012325904332101345, 0.06731707602739334, 0.04568001255393028, -0.04773757979273796, -0.012142524123191833, -0.03986259177327156, -0.027249159291386604, -0.04930245876312256, -0.10542229562997818, -0.05678592994809151, -0.038303568959236145, -0.07283245027065277, 0.0217409897595644, -0.01139344647526741, 0.006936497986316681, -0.04702157527208328, 0.09977010637521744, -0.035237088799476624, 0.028822069987654686, -0.0691431537270546, -0.0829710066318512, -0.1289154589176178, -0.08470306545495987, -0.06731563061475754, 0.06642980873584747, 0.026025734841823578, -0.04049745202064514, 0.030080674216151237, 0.04203929752111435, 0.06834205985069275, 0.04315062239766121, 0.00788890291005373, 0.03426999971270561, 0.08819636702537537, 0.004112098831683397, 0.03392210975289345, 0.010541473515331745, 0.08045777678489685, -0.02914009988307953, 0.0624285452067852, 0.03299122676253319, -0.05355033650994301, -0.07568570226430893, 0.08106201142072678, 0.0376802459359169, -0.04886564612388611, -0.10992937535047531, -0.00761816743761301, -0.014918084256350994, 0.03816765174269676, -0.04981819912791252, 0.00031993765151128173, 0.011382698081433773, -0.029902901500463486, -0.0117422454059124, -0.057965945452451706, -0.09519924223423004, 0.020727403461933136, -0.04526710882782936, 0.09883677959442139, 0.018033087253570557, -0.003035350237041712, -0.06968960911035538, -0.09893210977315903, -0.01264366414397955, 0.017397744581103325, -0.08519260585308075, 0.09382850676774979, -0.055508699268102646, -0.026548130437731743, -0.013868317008018494, -0.03162496164441109, 0.06089535728096962, -0.01583624631166458, -0.060260944068431854, 0.06709896773099899, -0.09333796799182892, -0.02887417934834957, -0.03424007445573807, -0.01687423326075077, 0.11968979239463806, -0.08361987769603729, 0.09037765115499496, -0.04322688281536102, -0.040831610560417175, -0.061376459896564484, 
-0.03485504537820816, 0.016033072024583817, 0.004106835462152958, -0.03354674205183983], + "MBB":[0.0285621527582407, 0.017540860921144485, -0.08473232388496399, -0.004012782592326403, 0.01284435298293829, -0.05268647149205208, 0.05576688051223755, 0.0021535248961299658, -0.03945871442556381, -0.006189210340380669, -0.015129411593079567, -0.08998296409845352, -0.023543253540992737, -0.03973307088017464, 0.03474939242005348, -0.01602775789797306, -0.07461361587047577, -0.016514597460627556, -0.016366377472877502, 0.004728052299469709, -0.023341577500104904, -0.0914730429649353, 0.030636735260486603, -0.03425632417201996, 0.03614623472094536, -0.007019295822829008, -0.0218521635979414, -0.015808485448360443, -0.05414801836013794, 0.029721688479185104, 0.09407073259353638, 0.029655681923031807, -0.005722714588046074, 0.08653672784566879, 0.01633341796696186, -0.07890991121530533, -0.07574641704559326, 0.013483843766152859, -0.0011275253491476178, -0.05623066797852516, -0.03096684440970421, -0.0019136210903525352, 0.005127475131303072, 0.005057196598500013, -0.008401975966989994, -0.0391613207757473, -0.0026145142037421465, 0.05342942103743553, 0.034099776297807693, 0.028928104788064957, -0.006105952430516481, -0.039190810173749924, 0.026784662157297134, -0.07679374516010284, -0.007475676946341991, -0.036650288850069046, 0.00774755235761404, 0.008984091691672802, -0.059830714017152786, 0.042310964316129684, 0.0681624785065651, -0.018189340829849243, -0.014816401526331902, -0.05541539564728737, -0.09348370134830475, 0.003691869555041194, -0.0010735570685938, -0.010131723247468472, -0.041050590574741364, -0.013792471028864384, -0.024337435141205788, 0.07526508718729019, 0.08163300901651382, -0.03508464992046356, -0.01681988686323166, -0.06734774261713028, -0.07656992971897125, -0.03866373747587204, 0.004544078838080168, 0.0585801787674427, -0.021823249757289886, -0.0610244981944561, -0.04469957575201988, -0.011089849285781384, -0.05069964751601219, -0.025694409385323524, -0.0670132040977478, 0.09616350382566452, 0.06308142840862274, -0.10543308407068253, 0.0023751568514853716, -0.06237253174185753, 0.05771911144256592, -0.06010056659579277, -0.016188565641641617, 0.009142348542809486, -0.014255198650062084, -0.02999819628894329, 0.00473234336823225, 0.03976761922240257], + "FrameIndex":[0.05219179764389992, -0.01926516741514206, -0.021848104894161224, -0.008528115227818489, 0.02989117242395878, -0.012461756356060505, -0.050973404198884964, 0.026713935658335686, 0.01968700997531414, -0.001058116089552641, 0.009182002395391464, 0.03877940773963928, 0.070717453956604, -0.0028735792730003595, 0.0528000183403492, -0.015265910886228085, 0.007753959856927395, 0.01596899703145027, -0.07933179289102554, -0.02578687109053135, 0.02417992427945137, -0.03462255373597145, 0.04385964199900627, 0.004388607107102871, 0.03716951236128807, 0.04064105078577995, 0.07711678743362427, 0.0068300217390060425, -0.05443308874964714, -0.010809220373630524, -0.03124961629509926, 0.004911563824862242, -0.09201066941022873, 0.051436200737953186, 0.015400445088744164, 0.07804328948259354, -0.02971532940864563, -0.0003241244703531265, -0.02131350338459015, -0.09173687547445297, -0.01707594096660614, 0.0025449323002249002, 0.08701702952384949, 0.10675988346338272, -0.05082142353057861, 0.021581847220659256, -0.04104776680469513, 0.08402986079454422, -0.06109907105565071, 0.015201682224869728, 0.04374992102384567, -0.028573378920555115, -0.07767742872238159, 0.07216905802488327, 0.020538095384836197, -0.01229778677225113, 
0.003033912740647793, -0.0007747758063487709, -0.09185474365949631, -0.02851664461195469, -0.009441743604838848, 0.05500328913331032, -0.002983751241117716, -0.09198789298534393, -0.051319632679224014, -0.054626885801553726, -0.020108554512262344, 0.0010591084137558937, -0.009138713590800762, 0.07223176956176758, -0.022099260240793228, 0.016025206074118614, -0.05320229008793831, 0.025131219998002052, 0.06626036763191223, 0.07639450579881668, -0.027084894478321075, 0.06581225991249084, -0.017618829384446144, -0.03859466314315796, -0.03385398909449577, 0.018783841282129288, -0.0730312392115593, 0.06957981735467911, -0.03065340407192707, 0.020685074850916862, -0.05311165004968643, 0.09466810524463654, 0.00955914705991745, -0.013919183053076267, -0.05540250986814499, -0.03087283857166767, -0.009688221849501133, 0.016239993274211884, -0.012926830910146236, -0.027712060138583183, -0.06342892348766327, -0.011996395885944366, 0.05536693334579468, -0.04359230771660805], + "ConstantPoolIndex":[0.041396364569664, -0.032536957412958145, -0.01450332161039114, -0.006678386591374874, 0.058945223689079285, 0.02544882893562317, -0.03047209233045578, -0.07739393413066864, -0.09328317642211914, -0.01668739691376686, -0.024649402126669884, -0.0379607230424881, -0.11910244077444077, -0.020992999896407127, -0.007654233835637569, -0.005232746247202158, -0.05641235038638115, -0.030478237196803093, -0.11095637828111649, -0.029757868498563766, 0.007831704802811146, -0.06478779017925262, -0.029330771416425705, -0.016729608178138733, 0.016851121559739113, -0.08636923134326935, 0.09819734841585159, -0.06862954050302505, -0.054081980139017105, -0.11573795974254608, 0.025045182555913925, -0.045820001512765884, -0.03937136381864548, -0.0006095073185861111, 0.010480350814759731, 0.04263518005609512, -0.07309181243181229, 0.030367357656359673, 0.05174611508846283, -0.07616177201271057, 0.08458246290683746, -0.05704038590192795, -0.08539492636919022, -0.027642514556646347, -0.01617196388542652, 0.025178344920277596, 0.009598441421985626, -0.02391812391579151, -0.007018273696303368, 0.08220435678958893, 0.019317878410220146, -0.07800780981779099, 0.008812256157398224, -0.08796992152929306, -0.018406951799988747, 0.06285018473863602, 0.0247958917170763, -0.010797450318932533, 0.042904313653707504, 0.04307369515299797, 0.03591239079833031, 0.0318138487637043, -0.052741825580596924, -0.05960077419877052, 0.05289359390735626, -0.07335714250802994, -0.07966916263103485, 0.06509458273649216, -0.014078558422625065, 0.05966315418481827, -0.10191051661968231, 0.038503143936395645, 0.08414285629987717, -0.09167703986167908, -0.03125883638858795, 0.00029595239902846515, -0.05052953213453293, 0.06109768897294998, 0.027757229283452034, 0.07064288854598999, 0.025423981249332428, 0.04430470988154411, 0.006646708585321903, 0.011614424176514149, -0.058028463274240494, -0.026873555034399033, -0.045714568346738815, -0.009242760017514229, -0.08255617320537567, 0.03060135245323181, -0.019932182505726814, -0.07189206779003143, 0.01935136877000332, 0.05297813192009926, 0.004497232846915722, -0.08383949100971222, -0.0008196682319976389, 0.03524069860577583, 0.023135961964726448, 0.00863903108984232], + "JumpTableIndex":[-0.007416237145662308, 0.0038157713133841753, 0.05180662125349045, 0.03776901960372925, -0.011749244295060635, -0.02952706068754196, -0.06646136939525604, 0.02088487148284912, -0.001927916775457561, 0.018895410001277924, 0.0509350448846817, 0.057210080325603485, -0.0476078987121582, -0.00016809302906040102, -0.02341553010046482, 
-0.06734820455312729, 0.02047930844128132, 0.009282611310482025, 0.0038133300840854645, 0.0020261742174625397, -0.09253961592912674, 0.0766557827591896, -0.049570225179195404, -0.11510220915079117, -0.009570423513650894, -0.007274465169757605, 0.07750000059604645, 0.02489926479756832, -0.08297400176525116, 0.048176445066928864, 0.03797437995672226, 0.060842450708150864, 0.020265065133571625, -0.03559373319149017, 0.03493893891572952, -0.0036544676404446363, 0.010211148299276829, -0.06471849977970123, -0.034595828503370285, -0.05245388671755791, -0.0014119939878582954, 0.008752748370170593, -0.020637203007936478, 0.053244929760694504, 0.052053239196538925, 0.014706660993397236, 0.02803724631667137, -0.07983336597681046, 0.03106858767569065, 0.001688914722763002, -0.07647732645273209, -0.028148295357823372, -0.0528123639523983, 0.08006428182125092, -0.06398879736661911, -0.033476538956165314, 0.05217607319355011, -0.03093232959508896, 0.044230975210666656, 0.05123162269592285, -0.05225585401058197, 0.06976816058158875, -0.0014492797199636698, 0.03833283483982086, 0.08385992050170898, -0.04722217097878456, -0.00226160092279315, -0.027254855260252953, -0.09566919505596161, 0.02109321765601635, -0.032354824244976044, 0.08032239973545074, -0.046937450766563416, -0.004326784983277321, -0.026024870574474335, 0.12039119750261307, 0.1016048863530159, 0.06808122247457504, -0.012297546491026878, -0.06450799852609634, 0.015778351575136185, 0.012280710972845554, 0.04002666845917702, 0.04792468994855881, -0.06248988211154938, -0.054222140461206436, 0.018379682675004005, -0.0029111658222973347, 0.016062958166003227, 0.09880068898200989, 0.03846307471394539, 0.04975416138768196, 0.07305088639259338, -0.020941948518157005, -0.020897891372442245, 0.03872328996658325, -0.05682756006717682, 0.09583723545074463, 0.0028475294820964336, -0.05127262324094772], + "ExternalSymbol":[0.014755810610949993, -0.049842361360788345, -0.06733497977256775, 0.05401315540075302, 0.061938412487506866, 0.02437831088900566, -0.06823863834142685, 0.03685877099633217, 0.02961423434317112, -0.04944299906492233, -0.1271103173494339, 0.030452819541096687, 0.019848955795168877, -0.03185190260410309, 0.06586895883083344, 0.0007315169204957783, 0.010839227586984634, -0.09547370672225952, -0.01799146644771099, -0.02204788289964199, 0.048699937760829926, 0.004187166225165129, 0.004053634125739336, -0.04464051127433777, -0.005158414598554373, -0.0416896678507328, -0.024279240518808365, -0.05358913540840149, -0.04719633609056473, -0.07180647552013397, 0.02559211477637291, 0.04657098650932312, 0.08353757858276367, -0.0023563469294458628, 0.046847302466630936, -0.03508693352341652, 0.0696689784526825, 0.054716791957616806, -0.012037037871778011, 0.019885245710611343, 0.01824580691754818, -0.06719563156366348, -0.05447190999984741, 0.08877509087324142, -0.01375679112970829, -0.014463561587035656, -0.049798283725976944, 0.06304343044757843, -0.007584648672491312, -0.016156170517206192, 0.024602508172392845, 0.004940119571983814, -0.04088609293103218, 0.0026271860115230083, 0.00787595845758915, -0.01889132149517536, -0.041029710322618484, 0.07343143969774246, -0.02505693957209587, -0.04825644940137863, 0.060728199779987335, 0.00460366066545248, 0.020744791254401207, 0.04238201677799225, -0.024090539664030075, -0.05792662873864174, 0.07639332860708237, -0.07511764764785767, -0.08259762078523636, 0.07901840656995773, -0.000285966758383438, 0.021390466019511223, -0.07818973809480667, -0.02385067008435726, -0.0014113716315478086, 
-0.055170729756355286, 0.00946732610464096, 0.02471417747437954, 0.07941421121358871, 0.006746167317032814, -0.06766024231910706, -0.089698426425457, 0.01933225803077221, -0.06994582712650299, -0.10149082541465759, 0.06007266044616699, -0.14545120298862457, -0.03447172790765762, 0.03258124738931656, 0.04966919496655464, 0.023691890761256218, -0.014501980505883694, 0.05896589905023575, 0.04760534316301346, -0.017742110416293144, 0.0019451226107776165, -0.01854461058974266, -0.04744676500558853, -0.017504630610346794, 0.05197983980178833], + "GlobalAddress":[0.021709734573960304, -0.03253590315580368, -0.04603651538491249, -0.02350226789712906, 0.02841794677078724, 0.01920732669532299, 0.053104616701602936, 0.03941836208105087, -0.01895466446876526, -0.030471740290522575, 0.010719750076532364, 0.020050356164574623, 0.03648754581809044, -0.021573888137936592, -0.02554452419281006, -3.637039117165841e-05, 0.05989491194486618, -0.006903402041643858, -0.08826262503862381, -0.028047384694218636, -0.04230065643787384, -0.05190899223089218, 0.06145390123128891, 0.0005839569494128227, -4.391977927298285e-05, -0.01880771853029728, 0.09660127758979797, 0.04333353415131569, 0.06461602449417114, -0.06010710820555687, -0.0690189078450203, 0.04574553668498993, -0.07640431076288223, 0.01879746839404106, 0.02076675370335579, 0.04869573190808296, 0.025147439911961555, 0.05311164632439613, 0.05711919441819191, 0.049520380795001984, 0.041169121861457825, -0.0603964701294899, -0.04195070639252663, 0.07676130533218384, -0.015161959454417229, 0.02903268299996853, -0.027548301964998245, 0.04705912992358208, -0.11194053292274475, -0.008245207369327545, -0.07792827486991882, -0.019468743354082108, 0.05482499673962593, -0.0028855702839791775, 0.05478052794933319, 0.07484771311283112, -0.011742575094103813, 0.00923923309892416, -0.05074375122785568, 0.06956734508275986, -0.045990440994501114, 0.007280972320586443, 0.040920473635196686, -0.09143709391355515, -0.06105270981788635, -0.0021254979074001312, -0.09519167989492416, 0.06324268877506256, -0.0693386048078537, -0.05100148543715477, 0.010643817484378815, -0.008162467740476131, -0.08811189234256744, -0.08640385419130325, 0.0077143507078289986, 0.030832089483737946, -0.01504515577107668, 0.07277517020702362, 0.02581198327243328, -0.052599068731069565, -0.06478387117385864, 0.01634707674384117, -0.021173706278204918, 0.030482977628707886, -0.09826494008302689, 0.07716016471385956, -0.10845024883747101, 0.04479274898767471, -0.015128640457987785, -0.03491876646876335, 0.05239150673151016, -0.03427724912762642, 0.06768845021724701, -0.04174086079001427, -0.05136744678020477, 0.0037109211552888155, -0.030324269086122513, -0.06928850710391998, -0.0395960658788681, 0.07726000994443893], + "RegisterMask":[0.009287647902965546, 0.029691029340028763, -0.03465871885418892, 0.032606374472379684, -0.007339544594287872, 0.03367740660905838, -0.0661492720246315, 0.0436118021607399, -0.002896533813327551, 0.028440887108445168, -0.06791415065526962, 0.004055356606841087, -0.01596181094646454, -0.003846745239570737, 0.06762582808732986, -0.025632556527853012, 0.08132420480251312, 0.025554664433002472, -0.08994632959365845, 0.02521730400621891, 0.023826507851481438, 0.0004487193073146045, 0.01047397032380104, 0.03246957063674927, -0.033482909202575684, 0.05051224306225777, 0.005778896156698465, -0.0006257061613723636, 0.00522293895483017, -0.04666636884212494, 0.022335125133395195, -0.022150320932269096, 0.04510439187288284, -0.02769547514617443, 0.026804683730006218, 
0.0710473507642746, -0.014513042755424976, 0.0695318952202797, 0.048469461500644684, -0.008654370903968811, -0.028613079339265823, -0.02918054349720478, -0.022721733897924423, -0.0004791628452949226, 0.011470172554254532, 0.08561886101961136, 0.07125027477741241, -0.05847848951816559, 0.011811288073658943, -0.025244031101465225, -0.03665035218000412, -0.03482883796095848, 0.04196881502866745, 0.06909161061048508, 0.02365143597126007, -0.0689089447259903, -0.0707414448261261, -0.03962424397468567, -0.025703679770231247, 0.06502455472946167, 0.057676125317811966, 0.026916807517409325, 0.024921152740716934, 0.009799988009035587, -0.018656229600310326, 0.009880480356514454, -0.06516153365373611, 0.019290866330266, 0.02236226759850979, -0.02598695270717144, -0.00299705658107996, 0.019448822364211082, -0.014883329160511494, 0.06645222008228302, -0.028751512989401817, -0.01589173451066017, 0.026225939393043518, 0.07285763323307037, -0.06037987396121025, -0.027615630999207497, -0.039930179715156555, -0.07122864574193954, 0.029825787991285324, 0.026364129036664963, -0.04438399150967598, 0.07015394419431686, -0.013950555585324764, 0.004367176443338394, 0.020521124824881554, 0.02030497044324875, 0.011951270513236523, 0.06765977293252945, -0.015042259357869625, 0.005189584568142891, -0.07532864063978195, -0.010886142030358315, 0.006792030762881041, -0.06348442286252975, 0.031859394162893295, -0.052482619881629944], + "Metadata":[-0.07879140228033066, 0.024690961465239525, 0.022790303453803062, 0.01354144886136055, -0.07098772376775742, 0.04053819552063942, -0.04038544371724129, -0.021055836230516434, 0.10361373424530029, 0.04415135458111763, -0.09545262902975082, 0.042553599923849106, -0.021835647523403168, 0.07703430950641632, -0.04880501329898834, -0.04054124280810356, 0.05049756169319153, 0.08986796438694, 0.0705084353685379, -0.0077315340749919415, -0.045390889048576355, 0.053155045956373215, 0.045656319707632065, -0.02663712576031685, -0.01446426473557949, -0.058978915214538574, 0.011314704082906246, 0.03043927252292633, -0.0843580812215805, 0.017854437232017517, -0.08720997720956802, 0.030351335182785988, -0.04896129295229912, 0.04189978539943695, -0.09887325763702393, 0.0015409664483740926, -0.08604399859905243, 0.10654544085264206, 0.1058540865778923, 0.014106648042798042, 0.0640459656715393, -0.05182884633541107, 0.006081609521061182, 0.07624028623104095, 0.02025698497891426, 0.08467324078083038, 0.027136018499732018, 0.026320911943912506, -0.035337720066308975, 0.03864980861544609, -0.019960917532444, -0.029152821749448776, 0.06562864780426025, 0.028298277407884598, -0.07397148013114929, -0.005078969523310661, 0.025909438729286194, -0.01157586183398962, 0.05436081811785698, 0.03408071771264076, -0.07142144441604614, -0.0523630827665329, -0.06302442401647568, -0.019975490868091583, -0.06937523931264877, 0.057667043060064316, -0.08580337464809418, -0.05092239752411842, -0.012613813392817974, 0.025480754673480988, 0.04219530522823334, -0.007300581783056259, 0.05323299020528793, 0.0489904023706913, 0.09260626882314682, -0.04819458723068237, 0.05419271066784859, 0.04558999091386795, 0.012036344967782497, -0.05483977124094963, -0.05181310698390007, -0.02104383148252964, -0.057876624166965485, 0.039601441472768784, 0.025240536779165268, -0.03984035924077034, 0.07654847204685211, -0.07073183357715607, -0.0018080074805766344, -0.016453349962830544, 0.03962434455752373, 0.05717255175113678, 0.01962372660636902, 0.00952839944511652, 0.0013127806596457958, 0.013634574599564075, 0.07692103832960129, 
0.06334574520587921, 0.056647684425115585, -0.02965259924530983], + "MCSymbol":[0.05158298835158348, 0.05024643987417221, 0.06704410910606384, 0.0378347709774971, -0.03902719169855118, -0.08626251667737961, 0.03964311257004738, 0.06615762412548065, 0.04361319541931152, 0.03646374121308327, -0.018487416207790375, 0.0024993624538183212, 0.006693041883409023, 0.08311881870031357, 0.021111667156219482, 0.038208797574043274, 0.08689694851636887, -0.03659898787736893, 0.020775076001882553, 0.03553535416722298, 0.06854367256164551, -0.002012243028730154, 0.03658154606819153, 0.03127564862370491, 0.0363621786236763, -0.027205800637602806, -0.05243372917175293, 0.012564878910779953, -0.013430594466626644, -0.04043225944042206, -0.025083716958761215, 0.09665156900882721, 0.005077417939901352, -0.05181048810482025, 0.08925056457519531, 0.0777667909860611, -0.013708796352148056, 0.07754126191139221, 0.08393577486276627, 0.06395212560892105, -0.07428556680679321, -0.052424050867557526, 0.03497577831149101, 0.01964585855603218, -0.0429445318877697, 0.07072066515684128, 0.0017074055504053831, 0.059513408690690994, 0.013262910768389702, -0.07240563631057739, 0.09288764744997025, 0.030620144680142403, -0.046197980642318726, 0.04847298562526703, -0.03942957893013954, -0.0025783153250813484, -0.019526517018675804, 0.038867682218551636, 0.006007499527186155, -0.06366054713726044, 0.004640159662812948, 0.013837787322700024, -0.020015377551317215, -0.010317903012037277, 0.001741019543260336, 0.06261103600263596, -0.03374830260872841, 0.01629183441400528, -0.013137640431523323, 0.026046304032206535, -0.009679407812654972, -0.07085473090410233, 0.03035539574921131, -0.08764562010765076, -0.03820766881108284, -0.04181021824479103, -0.05163294076919556, 0.06666433811187744, -0.08939782530069351, 0.040260378271341324, -0.06847432255744934, 0.09106951206922531, -0.07388591021299362, -0.07479099184274673, -0.001779694459401071, -0.0963745042681694, -0.06515862792730331, -0.08404017239809036, -0.09935544431209564, 0.010541093535721302, -0.04491754248738289, 0.09378639608621597, 0.006655062548816204, 0.06637217849493027, -0.05623293295502663, -0.020134123042225838, 0.005873391404747963, -0.07765494287014008, -0.0008442706312052906, -0.03568055108189583] + }, + "VirtualRegisters" : { + "VIRT_REG_FR32":[0.0034248235169798136, -0.011980761773884296, -0.0501178540289402, 0.0494888611137867, 0.06103336811065674, -0.06178610771894455, 0.007709897588938475, -0.011392943561077118, 0.06570645421743393, 0.0771368145942688, 0.0005577280535362661, 0.013396150432527065, -0.041660163551568985, 0.05122360959649086, 0.11354377865791321, -0.009875510819256306, -0.06466709822416306, 0.048170577734708786, 0.0007201629341579974, 0.06538223475217819, 0.08870227634906769, -0.05771782249212265, 0.009273379109799862, -0.03325295075774193, 0.01197165809571743, 0.06604835391044617, 0.08265330642461777, -0.005758166313171387, 0.02512396313250065, 0.03383670747280121, 0.038484204560518265, -0.06539343297481537, -0.013461028225719929, 0.001498897559940815, 0.05170154944062233, 0.06965786963701248, -0.07339458167552948, 0.05094756931066513, 0.01983451284468174, -0.06855696439743042, 0.07892709225416183, 0.06099703162908554, 0.08492864668369293, 0.05357863008975983, -0.009294840507209301, -0.0054923719726502895, -0.029938997700810432, 0.028260599821805954, 0.053790509700775146, -0.06574371457099915, -0.009621666744351387, -0.08131514489650726, -0.08474338054656982, 0.039622966200113297, 0.06945627927780151, 0.02545306645333767, 0.005390701815485954, 
0.04582791030406952, -0.1103447750210762, -0.050917647778987885, 0.03087870217859745, 0.06918162852525711, 0.0548822283744812, -0.01838473603129387, 0.05597897991538048, 0.03548860549926758, -0.009931124746799469, -0.07856663316488266, 0.033994875848293304, 0.03467561677098274, 0.09580692648887634, -0.04153195023536682, -0.06732118874788284, -0.06857144832611084, 0.03419093042612076, -0.01200241968035698, -0.06983492523431778, 0.05929506942629814, -0.00041734304977580905, -0.026396293193101883, 0.05230500176548958, -0.006162640172988176, 0.044198282063007355, -0.028765834867954254, 0.031155114993453026, 0.06967037916183472, -0.0892564132809639, 0.028816571459174156, -0.037065472453832626, 0.06540130823850632, -0.01888667233288288, 0.030632384121418, 0.0359313078224659, 0.106044240295887, 0.03259910270571709, -0.0775517001748085, -0.04267778620123863, 0.04977935180068016, -0.01790289767086506, -0.11223265528678894], + "VIRT_REG_FR64":[0.08496882021427155, 0.049308884888887405, -0.016840212047100067, 0.010602951049804688, -4.6025739720789716e-05, -0.06524767726659775, 0.048670798540115356, -0.06444543600082397, -0.0031944462098181248, 0.05608433857560158, -0.03958145156502724, 0.05171080678701401, -0.03572545200586319, -0.054364755749702454, 0.052311528474092484, -0.0361458919942379, 0.024109655991196632, 0.15923210978507996, -0.07255382835865021, -0.011799084022641182, -0.06846465915441513, 0.0023571476340293884, 0.02642918936908245, -0.05057685822248459, 0.029800178483128548, -0.06036723777651787, -0.012272411957383156, -0.022802220657467842, -0.02426644042134285, 0.05623406544327736, -0.07506053894758224, -0.02078152634203434, 0.02549685165286064, -0.030025657266378403, -0.0627482682466507, 0.062375299632549286, 0.03684084117412567, 0.06365678459405899, 0.0004415051080286503, -0.002180535811930895, 0.05225013941526413, -0.0693102702498436, -0.03649357333779335, 0.005159272346645594, -0.03298519179224968, 0.041419681161642075, -0.05325934663414955, -0.017585784196853638, -0.03843431547284126, -0.002649943344295025, 0.033329058438539505, -0.04736043140292168, -0.043852102011442184, -0.06713785231113434, -0.03237355872988701, 0.012679073959589005, -0.01959240809082985, 0.07324203103780746, 0.07468831539154053, 0.03327644243836403, -0.01596391387283802, 0.12015434354543686, 0.051839299499988556, 0.00980563648045063, -0.08275608718395233, 0.04445798322558403, -0.03891860321164131, 0.10891054570674896, -0.008730625733733177, -0.051655255258083344, -0.05982912331819534, 0.04106972739100456, 0.06872759014368057, 0.013289053924381733, 0.03469584137201309, -0.06673429906368256, -0.0695682018995285, 0.047426726669073105, 0.02815094031393528, -0.05552271753549576, 0.0010567272547632456, -0.051840681582689285, -0.01704293303191662, -0.047185055911540985, 0.036965738981962204, 0.03452568128705025, -0.05430837720632553, 0.0383443646132946, 0.0003438846324570477, -0.030417989939451218, 0.02749026007950306, -0.0546082966029644, 0.03005768544971943, 0.0025131346192210913, 0.0013019279576838017, -0.054173994809389114, -0.008382225409150124, 0.02153395675122738, 0.011912085115909576, -0.10461334884166718], + "VIRT_REG_GR16":[0.09543223679065704, 0.03513967618346214, 0.08986528217792511, -0.012217407114803791, -0.02076001651585102, -0.04190119728446007, 0.01318269595503807, -0.010142332874238491, -0.011869532987475395, -0.040446147322654724, 0.06552371382713318, 0.04439055174589157, 0.08176156878471375, -0.06334159523248672, -0.033928077667951584, -0.00024628525716252625, 0.0244551170617342, 
-0.019419007003307343, -0.09592454880475998, 0.005961012560874224, 0.03278326243162155, -0.07028506696224213, -0.08484592288732529, -6.329250754788518e-05, 0.015018146485090256, -0.05068608745932579, 0.0732998326420784, 0.023434389382600784, 0.0002124009479302913, 0.060401707887649536, 0.013626078143715858, -0.010556582361459732, -0.005069760140031576, -0.004616749472916126, -0.034329116344451904, 0.060584329068660736, -0.05430089309811592, -0.029179023578763008, 0.042385730892419815, -0.0652197003364563, 0.09378205984830856, -0.05090794339776039, -0.008510591462254524, 0.0837036669254303, 0.009071480482816696, 0.04464874789118767, -0.012855015695095062, 0.06306030601263046, -0.08556588739156723, -0.05393703281879425, -0.06741822510957718, -0.03717748448252678, 0.017156923189759254, 0.07401604950428009, -0.06629005819559097, -0.04564857482910156, -0.055414989590644836, 0.039407771080732346, -0.04089723527431488, 0.06915309280157089, 0.030190052464604378, 0.027542876079678535, 0.03557966649532318, 0.05191207677125931, -0.03237364813685417, -0.02036256715655327, -0.071859210729599, -0.06704329699277878, 0.0336633175611496, 0.09511569887399673, 0.0048662531189620495, 0.05273270234465599, -0.056247059255838394, 0.06079721450805664, -0.04150049015879631, -0.08104457706212997, -0.10303051024675369, 0.04522428661584854, -0.04379847273230553, -0.019447194412350655, 0.0021319733932614326, -0.010465282015502453, 0.06857019662857056, -0.00443653529509902, -0.08039603382349014, -0.05012141168117523, 0.0875077098608017, -0.03053239732980728, -0.05321606993675232, 0.016501901671290398, -0.0563507042825222, -0.03187479078769684, -0.0015389680629596114, 0.022985411807894707, -0.05008963868021965, 0.028300117701292038, 0.02875804342329502, -0.024458128958940506, -0.022238614037632942, -0.049835607409477234], + "VIRT_REG_GR32":[-0.008479167707264423, -0.02941126376390457, 0.05343153327703476, 0.03769504278898239, -0.0006716987118124962, -0.0329299233853817, 0.03442851081490517, -0.06826753169298172, -0.09117511659860611, -0.018657755106687546, 0.029032904654741287, 0.02404048666357994, 0.010598761960864067, -0.0482308566570282, 0.06956348568201065, -0.027967501431703568, -0.07380961626768112, -0.021098148077726364, -0.0808446854352951, 0.0127912862226367, -0.01355082169175148, -0.040285225957632065, 0.035385165363550186, -0.001157263875938952, -0.026462145149707794, -0.08616211265325546, -0.044482193887233734, -0.010969695635139942, 0.04645564407110214, -0.018178211525082588, -0.038536932319402695, -0.027571648359298706, -0.007523007690906525, -0.02699458785355091, -0.039170436561107635, 0.12889482080936432, -0.04512789845466614, -0.03883056715130806, 0.051210880279541016, 0.03924906626343727, 0.036943964660167694, -0.016879307106137276, 0.011263007298111916, 0.053573690354824066, -0.018964825198054314, -0.041856080293655396, -0.036545924842357635, 0.07715532928705215, -0.041981130838394165, -0.04114629328250885, -0.04393022507429123, -0.030163627117872238, 0.0019487979589030147, 0.10988762229681015, 0.09039165079593658, -0.0035424421075731516, -0.06272851675748825, 0.007701062131673098, -0.01971622183918953, 0.06203003600239754, 0.048561323434114456, -0.04599940404295921, 0.00802221056073904, -0.002905400237068534, -0.1050020381808281, 0.003395768813788891, -0.07973644882440567, 0.008020970039069653, -0.08614815771579742, 0.0518532320857048, 0.021174483001232147, 0.03254232555627823, -0.01905026100575924, -0.0009989180834963918, -0.06409642845392227, -0.022425753995776176, -0.03563409671187401, 
0.07717793434858322, -0.04553033784031868, -0.02112392708659172, -0.002374667674303055, 0.03828585892915726, -0.014221777208149433, -0.015974245965480804, -0.01805220916867256, 0.04202109947800636, -0.0841534212231636, 0.06608130037784576, -0.11586519330739975, 0.024179989472031593, 0.017091574147343636, 0.08567194640636444, -0.03692129999399185, 0.03266705200076103, -0.046154942363500595, 0.0040525165386497974, -0.03177625685930252, 0.039895471185445786, 0.042960215359926224, -0.05573953315615654], + "VIRT_REG_GR32_ABCD":[0.016604775562882423, -0.0028934956062585115, 0.041060179471969604, -0.025077441707253456, -0.018642406910657883, 0.023762650787830353, -0.028646549209952354, -0.02460283786058426, 0.005985732190310955, 0.01774146780371666, -0.004014404024928808, -0.05473850294947624, -0.0417158380150795, -0.06322457641363144, 0.060795728117227554, -0.036435071378946304, -0.04245952516794205, 0.08069344609975815, 0.035319335758686066, -0.012020719237625599, 0.045771341770887375, -0.10842540860176086, 0.046253710985183716, -0.004099135287106037, 0.030616935342550278, -0.08288344740867615, 0.08569363504648209, -0.014164377935230732, -0.004303323570638895, 0.09726760536432266, 0.06208871304988861, -0.04007713496685028, 0.005815347656607628, 0.02377200312912464, 0.07813961058855057, 0.03192306309938431, -0.006230524741113186, 0.10110925883054733, -0.023409254848957062, 0.030774405226111412, -0.011607645079493523, -0.03929119184613228, 0.004817614797502756, -0.013827506452798843, 0.07770339399576187, -0.07994075864553452, -0.03157062083482742, 0.06743781268596649, 0.014881699346005917, -0.030165214091539383, -0.07844353467226028, -0.04563238099217415, 0.09747181832790375, 0.057128582149744034, 0.04173563793301582, -0.0011194447288289666, -0.01902887038886547, -0.032171595841646194, 0.04824799671769142, 0.008433254435658455, 0.024706291034817696, 0.0746094286441803, 0.04515853151679039, -0.0018984260968863964, -0.10070884972810745, -0.01883143000304699, -0.07785795629024506, 0.10938235372304916, -0.08001448959112167, -0.07419873028993607, 0.010544849559664726, 0.025767439976334572, -0.1005895584821701, 0.05103800818324089, -0.03675306960940361, -0.020510872825980186, 0.022482097148895264, 0.06463642418384552, -0.03149804100394249, -0.021647030487656593, 0.04025804623961449, 0.003628256032243371, 0.03532547131180763, -0.08667688816785812, 0.018817460164427757, -0.01690257526934147, -0.10114696621894836, -0.022815177217125893, 0.024386661127209663, 0.10286301374435425, 0.030005114153027534, 0.0370776504278183, -0.008584428578615189, -0.077603779733181, -0.03588058054447174, 0.030617419630289078, -0.07383710891008377, 0.03215676173567772, 0.03288266062736511, -0.036702848970890045], + "VIRT_REG_GR32_NOREX":[0.019052108749747276, -0.006784944795072079, -0.05410394072532654, 0.001966317882761359, -0.06686867773532867, 0.013514372520148754, 0.030097918584942818, -0.03868359327316284, 0.004314934369176626, -0.06713679432868958, 0.02491898462176323, 0.027683967724442482, 0.035907283425331116, -0.023093875497579575, -0.0892200842499733, -0.1052003800868988, -0.03923499956727028, 0.08808581531047821, -0.10092058777809143, 0.03336786851286888, -0.08974049985408783, -0.015254802070558071, 0.039686985313892365, -0.010083628818392754, -0.03423550724983215, -0.08821681141853333, -0.05621311068534851, -0.020327769219875336, -0.016793876886367798, 0.08908043801784515, -0.04112761467695236, -0.050139520317316055, -0.01524045504629612, 0.05841142684221268, 0.08270087838172913, 0.0348736047744751, 
-0.016146546229720116, 0.05751227214932442, 0.05081859603524208, -0.07304663956165314, -0.047101784497499466, -0.02825125865638256, 0.0006340605323202908, 0.0008785317186266184, -0.044239338487386703, 0.007173972204327583, -0.029449066147208214, 0.07254412025213242, -0.026029080152511597, 0.025982191786170006, -0.09524690359830856, -0.052613094449043274, -0.1270490437746048, 0.05319184809923172, 0.1046818196773529, 0.0477570965886116, -0.06291303783655167, 0.04725426062941551, -0.05330964922904968, 0.04056742787361145, 0.01543382927775383, 0.03627128154039383, -0.048232536762952805, 0.014761016704142094, -0.007380587514489889, -0.008060632273554802, -0.021923277527093887, -0.022500980645418167, -0.08495079725980759, 0.045358967036008835, -0.04728720709681511, 0.03550735488533974, 0.03445536270737648, -0.01891610585153103, -0.09439470618963242, -0.044266197830438614, -0.07952893525362015, 0.05221104994416237, -0.03507477045059204, 0.04218391329050064, 0.040326621383428574, -0.0395088866353035, 0.02447870559990406, -0.04280063137412071, 0.06520935893058777, -0.003358252113685012, -0.057561881840229034, 0.01911463774740696, 0.05295571684837341, 0.030342884361743927, 0.03814920783042908, -0.03366788476705551, 0.03090745024383068, 0.09487249702215195, -0.002995486371219158, -0.012020634487271309, -0.029147809371352196, 0.09558248519897461, 0.02548893168568611, 0.0931544378399849], + "VIRT_REG_GR64":[0.02717440389096737, -0.026730243116617203, -0.023244258016347885, 0.04027782380580902, 0.006808254402130842, -0.027519788593053818, -0.01906559243798256, 0.027793627232313156, -0.00129543652292341, -0.03455121070146561, 0.021734628826379776, 0.035481199622154236, -0.07251942157745361, -0.025691546499729156, -0.03271827474236488, -0.13225725293159485, -0.0601421520113945, 0.09084498882293701, -0.10225717723369598, 0.004034099169075489, 0.023578351363539696, -0.041603971272706985, 0.04199974611401558, -0.014711204916238785, -0.04272732138633728, -0.12534455955028534, -0.023738788440823555, 0.005328727886080742, 0.038416482508182526, -0.026419155299663544, -0.041119154542684555, 0.00022502713545691222, -0.05204978585243225, -0.019709734246134758, -0.04102563485503197, 0.06480151414871216, 0.009224721230566502, 0.04627599939703941, 0.027821402996778488, -0.05595114827156067, 0.04526345059275627, 0.024196594953536987, 0.10446277260780334, 0.07561361789703369, -0.08028160035610199, -0.0314163975417614, 0.11944323033094406, 0.1025814488530159, -0.08457476645708084, 0.02227119728922844, -0.041679076850414276, -0.02260834351181984, 0.036674268543720245, 0.10488750785589218, 0.019218411296606064, -0.015966340899467468, -0.06852715462446213, 0.026523491367697716, -0.11090730130672455, -0.0021082640159875154, -0.048291631042957306, -0.032388005405664444, 0.015713853761553764, 0.03355225548148155, -0.06502845883369446, -0.010098783299326897, -0.09930021315813065, -0.017413528636097908, -0.055861033499240875, 0.0801810696721077, -0.03900628536939621, -0.03278445452451706, -0.0337282195687294, -0.11434067040681839, -0.04371264949440956, -0.01736009307205677, -0.05100121721625328, 0.07490750402212143, -0.014680330641567707, -0.02126181870698929, 0.018013890832662582, 0.0018135658465325832, 0.029781077057123184, -0.012477489188313484, -0.021443217992782593, 0.047576501965522766, -0.05993758141994476, -0.06040889024734497, 0.016642581671476364, 0.011624492704868317, -0.042229063808918, -0.007573941722512245, -0.04010608047246933, -0.006444427650421858, -0.014495199546217918, -0.04122597724199295, 
-0.08505907654762268, -0.004049300216138363, 0.06545045226812363, -0.04762336611747742], + "VIRT_REG_GR64_ABCD":[0.04577033221721649, -0.07758746296167374, 0.00799313560128212, -0.11011485010385513, -0.010862522758543491, 0.012709266506135464, 0.05257265642285347, -0.07354705780744553, 0.04262387007474899, 0.07554348558187485, -0.06358839571475983, 0.006669520866125822, 0.049098193645477295, 0.11183933168649673, -0.028112098574638367, 0.021986473351716995, -0.02839403599500656, -0.06199958547949791, 0.08614487200975418, -0.041216861456632614, 0.041238460689783096, 0.005937385838478804, 0.00200703926384449, -0.05337367579340935, 0.037919919937849045, -0.07485998421907425, -0.09153831005096436, -0.0554175041615963, -0.10251995176076889, -0.01289951242506504, -0.030631467700004578, 0.04197017475962639, -0.03578301519155502, 0.010593005456030369, -0.05836241692304611, 0.06809061765670776, 0.10871735960245132, -0.09833388775587082, -0.009873395785689354, -0.056898634880781174, 0.05946199968457222, 0.015534073114395142, 0.01677171140909195, -0.020233800634741783, -0.006396631710231304, -0.049332089722156525, 0.012649210169911385, 0.03756912052631378, 0.0033660116605460644, -0.09084216505289078, -0.07142844051122665, -0.0030346515122801065, 0.0019640070386230946, 0.038837920874357224, 0.011760945431888103, 0.04995080456137657, -0.06997165083885193, -0.035297296941280365, 0.01996617764234543, 0.01954355463385582, -0.0934600979089737, 0.030165065079927444, -0.007337240036576986, -0.05346155911684036, 0.0732186883687973, -0.04716489836573601, -0.06555212289094925, -0.018465254455804825, 0.051119767129421234, -0.03106619231402874, 0.0748852789402008, -0.02095886692404747, 0.006320921704173088, 0.03146332502365112, -0.08238139003515244, -0.03618254140019417, -0.014570276252925396, 0.062481846660375595, -0.0394093319773674, -0.05171547457575798, -0.044726233929395676, -0.01228095218539238, 0.09699232876300812, 0.07471026480197906, 0.03112417459487915, 0.022543631494045258, -0.08634103089570999, 0.059702761471271515, -0.013801504857838154, 0.004984616301953793, 0.045798566192388535, -0.03205988556146622, -0.06150995194911957, -0.02244667150080204, 0.03318532556295395, 0.03462471440434456, 0.03236381709575653, 0.0884014293551445, -0.01604369841516018, -0.05234146490693092], + "VIRT_REG_GR64_NOREX":[-0.03959479182958603, -0.06190898269414902, -0.02920372597873211, -0.09973344951868057, -0.004333901684731245, -0.08522991091012955, 0.0459987074136734, -0.057674553245306015, 0.037046968936920166, -0.05669403821229935, -0.02221340872347355, -0.062426190823316574, 0.05804889276623726, -0.02635439857840538, -0.045627325773239136, 0.03632078319787979, 0.07128578424453735, 0.07544906437397003, -0.0537678524851799, -0.04624016210436821, 0.014316501095890999, 0.05580946058034897, 0.05251356214284897, -0.08244197070598602, -0.08901460468769073, -0.07641059905290604, -0.04924754425883293, 0.05417120084166527, -0.0060508353635668755, -0.00814742036163807, -0.06154030188918114, 0.05966867506504059, -0.03231468051671982, 0.021429890766739845, 0.031103987246751785, 0.04343251883983612, -0.08997714519500732, 0.039365898817777634, 0.052908625453710556, -0.02683917060494423, -0.05547752603888512, -0.014131218194961548, 0.0016863569617271423, -0.041112788021564484, -0.010230163112282753, -0.06687774509191513, -0.006144971586763859, -0.08074352145195007, 0.04034091532230377, -0.08176303654909134, -0.004055786412209272, -0.0024839320685714483, -0.007289807312190533, 0.06915127485990524, 0.023709064349532127, 
0.04671626538038254, 0.06229325756430626, 0.04707597941160202, 0.06800796836614609, -0.02885584905743599, 0.030613983049988747, -0.019083039835095406, 0.045457858592271805, 0.040770504623651505, -0.05441175401210785, -0.05712401866912842, 0.07744520157575607, -0.0756613239645958, -0.06890957802534103, -0.07997069507837296, 0.09348486363887787, -0.04511028528213501, 0.036194607615470886, 0.040017660707235336, 0.016245214268565178, 0.023104460909962654, 0.058383163064718246, 0.0679842159152031, -0.00921112485229969, -0.10036550462245941, 0.09075804799795151, -0.059704095125198364, -0.013338442891836166, -0.005139742512255907, 0.07807526737451553, 0.06255412846803665, -0.008151572197675705, -0.0624256506562233, 0.012590888887643814, 0.03665084019303322, -0.028498578816652298, -0.01614067517220974, 0.007552243769168854, -0.007216903381049633, 0.0760180801153183, -0.04200543463230133, 0.06412865966558456, -0.05136435106396675, -0.0024792966432869434, 0.06856651604175568], + "VIRT_REG_GR64_NOREX_NOSP":[-0.0656895712018013, 0.058077458292245865, -0.006653467658907175, 0.037784356623888016, 0.07274001836776733, 0.07232078164815903, 0.07074914127588272, 0.05637859180569649, 0.04296007752418518, 0.05499762296676636, -0.01783664897084236, -0.08387365937232971, -0.01376343984156847, -0.07938199490308762, -0.027822256088256836, -0.0663403570652008, 0.036170270293951035, -0.07460261881351471, 0.08652043342590332, 0.02483147382736206, -0.07939319312572479, 0.033202506601810455, 0.0903514102101326, -0.10181311517953873, 0.060751549899578094, 0.07619930803775787, 0.05017509311437607, -0.0470910519361496, 0.07713821530342102, -0.0426195003092289, -0.04506472498178482, 0.003363420255482197, -0.0017315347213298082, 0.06264199316501617, 0.005245774984359741, -0.027923958376049995, 0.09868567436933517, 0.06738796830177307, -0.10339145362377167, 0.0020383980590850115, 0.087734155356884, 0.011040030047297478, -0.05993311479687691, -0.05790332704782486, 0.01574312523007393, 0.009771298617124557, 0.022676382213830948, -0.009197148494422436, 0.03372732177376747, 0.08404259383678436, -0.015135225839912891, -0.04693703353404999, 0.09917140752077103, 0.007134507410228252, 0.020209072157740593, -0.00027669535484164953, -0.0351635180413723, 0.03751315921545029, -0.019665181636810303, 0.028500953689217567, 0.034186746925115585, -0.005931361112743616, 0.05645192414522171, -0.02027188241481781, -0.022675039246678352, -0.08812297880649567, -0.014896178618073463, -0.048788342624902725, 0.008708382956683636, 0.019917558878660202, -0.002275944221764803, 0.03409638628363609, 0.033304013311862946, 0.057676300406455994, 0.039842985570430756, -0.025169866159558296, 0.016520975157618523, -0.030201178044080734, -0.021718870848417282, -0.07023277878761292, -0.007528252899646759, 0.009067370556294918, -0.0460657961666584, 0.07117785513401031, -0.03609836474061012, -0.011893372051417828, -0.006047600414603949, 0.0179970171302557, 0.024480223655700684, -0.03918423503637314, 0.004897980485111475, 0.05040167644619942, 0.010113563388586044, -0.1074901670217514, -0.06277655810117722, -0.02934161201119423, -0.06922926008701324, -0.05638887360692024, 0.05314395949244499, 0.04588884115219116], + "VIRT_REG_GR64_NOSP":[0.0015277941711246967, -0.03938478231430054, -0.030811766162514687, 0.027071669697761536, 0.02127140760421753, 0.0015787228476256132, -0.07842491567134857, 0.004658385645598173, -0.05909501388669014, -0.03576778993010521, -0.07251477241516113, 0.12117832154035568, 0.04499363154172897, -0.009405314922332764, -0.01015283353626728, 
-0.002841090550646186, 0.0689091831445694, 0.10697457194328308, -0.09274765104055405, -0.027955353260040283, -0.0379958301782608, -0.044126156717538834, 0.04907212778925896, -0.038063473999500275, -0.003686746582388878, -0.08313410729169846, -0.045181579887866974, -0.011702840216457844, -0.006579228211194277, 0.046807315200567245, -0.045654296875, -0.03466613590717316, -0.08313826471567154, -0.06678880006074905, -0.027727074921131134, 0.036734677851200104, -0.040936414152383804, 0.05170389637351036, 0.038199927657842636, 0.02960256300866604, 0.0355701707303524, -0.02052776888012886, 0.06218089163303375, 0.10570456087589264, -0.036479029804468155, -0.008999336510896683, -0.031860992312431335, 0.07250168174505234, -0.061084795743227005, -0.057996805757284164, -0.010533110238611698, -0.018169214949011803, 0.017261315137147903, 0.10023517906665802, -0.044131457805633545, -0.07618662714958191, -0.09124933928251266, 0.01819406822323799, -0.05906827375292778, 0.04295642301440239, -0.03197735920548439, 0.03641442954540253, 0.005168464966118336, -0.00010972691961796954, -0.0829579159617424, -0.014677388593554497, -0.08750011026859283, -0.04695136100053787, -0.07696729153394699, -0.00718996487557888, 0.018294518813490868, -0.014321570284664631, -0.04416860267519951, -0.0890057235956192, -0.014466283842921257, 0.02831638976931572, -0.04845190420746803, 0.08228176832199097, 0.03420877829194069, 0.056510377675294876, 0.037403274327516556, 0.04364967346191406, 0.08903267979621887, -0.016827082261443138, -0.0682789757847786, 0.06286796927452087, -0.0958203598856926, 0.018489282578229904, 0.02886355295777321, 0.028006011620163918, 0.039986785501241684, -0.04771937429904938, -0.004648604430258274, 0.033939141780138016, -0.027820419520139694, -0.026187442243099213, -0.07972361892461777, 0.006323353853076696, 0.016448041424155235, -0.01961681991815567], + "VIRT_REG_GR64_NOSP_and_GR64_TC":[0.08079065382480621, -0.05147358775138855, -0.08338657021522522, 0.06757336109876633, -0.015237463638186455, 0.026806311681866646, 0.07564966380596161, -0.037159934639930725, -0.02222878858447075, -0.04553138092160225, -0.006632891017943621, 0.001604291144758463, 0.043711669743061066, 0.0710049569606781, -0.08854726701974869, -0.03142566233873367, -0.0865127220749855, 0.08521236479282379, 0.039203498512506485, 0.04737624153494835, 0.02893459051847458, 0.004120660945773125, 0.03552098199725151, -0.0010448878165334463, 0.04423774778842926, 0.03258584439754486, 0.03433830663561821, -0.019990455359220505, -0.03263172507286072, 0.09782663732767105, -0.00702365068718791, -0.06544602662324905, 0.013447105884552002, 0.04603038728237152, 0.029931804165244102, 0.0988783910870552, -0.062023941427469254, -0.0070026409812271595, 0.032557111233472824, -0.08212000876665115, 0.03199682757258415, 0.020828546956181526, 0.07071725279092789, -0.018812179565429688, -0.0184739138931036, -0.06008931249380112, 0.01504000648856163, -0.019235603511333466, 0.014653048478066921, -0.009083813987672329, 0.03171474114060402, 0.019499456509947777, 0.05263463407754898, 0.10554639250040054, -0.02759619802236557, -0.00156494346447289, -0.03898271545767784, 0.06027846410870552, -0.061001915484666824, 0.039365388453006744, -0.06546281278133392, 0.0006352368509396911, 0.0500405877828598, -0.03232716768980026, -0.010176514275372028, 0.002549059921875596, 0.0666508674621582, -0.037290267646312714, -0.028836704790592194, 0.06271649152040482, -0.016647985205054283, 0.013602355495095253, 0.020110899582505226, 0.011730309575796127, -0.10071564465761185, 
-0.06239647418260574, -0.09507977962493896, -0.09190725535154343, -0.08861985802650452, -0.0006123466300778091, 0.0951915979385376, -0.035364676266908646, -0.04007220268249512, 0.08415472507476807, 0.0006664254469797015, 0.05864431709051132, 0.01460045762360096, -0.09507087618112564, 0.024228032678365707, 0.04208158329129219, 0.006106846500188112, 0.09294755011796951, 0.06157369166612625, 0.0826527327299118, -0.058974966406822205, -0.09958664327859879, 0.06913749873638153, -0.08108915388584137, 0.07425157725811005, 0.04784728214144707], + "VIRT_REG_GR64_TC":[-0.0944172665476799, 0.040403831750154495, -0.017597073689103127, 0.04766053333878517, -0.03104357235133648, 0.025751160457730293, 0.036779265850782394, -0.0235747080296278, 0.032111138105392456, 0.009872193448245525, -0.01596468687057495, 0.05234881862998009, -0.047335200011730194, 0.005157034378498793, -0.02132921665906906, -0.0544377863407135, 0.057515472173690796, -0.006743279751390219, -0.01474941335618496, -0.0990658849477768, 0.022418741136789322, -0.007098495960235596, 0.046933863312006, 0.1002131924033165, 0.01583809033036232, 0.03995800018310547, -0.017743254080414772, -0.01684877835214138, 0.06543229520320892, 0.04597911611199379, 0.05365373566746712, -0.008774830959737301, -0.01341968309134245, -0.004754040390253067, 0.04739849269390106, 0.032378777861595154, -0.0020728895906358957, 0.03502136841416359, 0.05946416035294533, -0.06190952658653259, 0.01910495012998581, -0.023678753525018692, 0.012653682380914688, -0.06766874343156815, -0.0729866623878479, 0.0757005363702774, -0.027033904567360878, -0.06776778399944305, -0.010131776332855225, -0.06334701925516129, -0.04702980816364288, 0.06837917864322662, 0.002726735547184944, 0.04345812648534775, 0.04288078844547272, -0.06921732425689697, -0.07625382393598557, 0.037991974502801895, -0.04257906600832939, 0.06338586658239365, 0.05315309390425682, -0.02785014547407627, 0.04054750129580498, 0.06967299431562424, -0.07271680235862732, 0.0032969408202916384, -0.08254148811101913, 0.07269596308469772, -0.01827111467719078, 0.034775473177433014, 0.010106234811246395, 0.0389409065246582, 0.042805008590221405, -0.03822058066725731, 0.0668339803814888, -0.005216705612838268, -0.00022202919353730977, -0.0221820380538702, -0.027401722967624664, -0.045061662793159485, -0.05296671763062477, -0.0190189890563488, -0.002744461875408888, -0.04073096439242363, -0.06974441558122635, 0.05868958309292793, -0.06907399743795395, -0.026619713753461838, 0.015318086370825768, 0.035948701202869415, -0.08301021158695221, 0.03955607861280441, 0.028369972482323647, 0.0202812347561121, -0.12075140327215195, -0.039504438638687134, -0.03826067969202995, 0.01607581228017807, 0.02135113812983036, -0.08897850662469864], + "VIRT_REG_GR64_TC_with_sub_8bit":[0.00805664248764515, 0.06228634715080261, -0.005148644559085369, -0.025605352595448494, -0.04853198677301407, -0.018169978633522987, 0.008530518971383572, -0.1050964742898941, -0.08428415656089783, -0.014802628196775913, 0.05918573588132858, 0.07529161125421524, 0.09815273433923721, -0.014188972301781178, 0.06676790118217468, 0.09496084600687027, -0.03843621164560318, -0.00740150036290288, -0.11988909542560577, -0.01781499572098255, -0.03719411790370941, -0.07447166740894318, 0.005513608455657959, -0.014381160028278828, 0.036786310374736786, -0.04839075356721878, -0.009440913796424866, 0.03984222561120987, -0.08096668124198914, 0.026751000434160233, 0.06400448083877563, 0.07998895645141602, 2.295125523232855e-05, 0.0266779325902462, -0.0030931613873690367, 
0.05236855521798134, -0.010479471646249294, -0.011119752191007137, -0.06124376133084297, -0.019449712708592415, 0.03448517248034477, -0.04095051810145378, 0.01377212442457676, 0.09643338620662689, 0.021325431764125824, 0.06029453128576279, 0.048866767436265945, -0.03436344116926193, -0.043422505259513855, 0.03822150453925133, 0.004718889016658068, -0.04090931639075279, -0.04219569265842438, 0.019032739102840424, 0.06111171841621399, 0.04305591061711311, -0.0379939004778862, -0.03224434703588486, -0.06517905741930008, 0.002272483194246888, 0.09273418039083481, -0.028145847842097282, 0.01824336126446724, 0.00936606340110302, -0.07281909137964249, -0.028650810942053795, -0.060721538960933685, -0.09477518498897552, -0.0014060320099815726, 0.06919887661933899, -0.03463669493794441, 0.0026504716370254755, -0.0653621107339859, -0.02800566703081131, -0.02503957599401474, -0.060285311192274094, 0.014794053509831429, -0.08424058556556702, 0.0482206828892231, -0.07467620074748993, -0.09909844398498535, -0.06888734549283981, -0.0014173799427226186, -0.09022543579339981, 0.06461413204669952, 0.024526789784431458, -0.07400602847337723, -0.008816084824502468, 0.025513656437397003, 0.047476526349782944, -0.05981749668717384, 0.08338218182325363, 0.02657591737806797, 0.03547860309481621, -0.043622229248285294, 0.10129662603139877, 0.08802521973848343, -0.09759330749511719, 0.025680232793092728, 0.05964493378996849], + "VIRT_REG_GR64_with_sub_16bit_in_GR16_NOREX":[-0.03117012232542038, -0.02872271090745926, -0.039712607860565186, 0.03738812729716301, 0.030099159106612206, 0.00013636364019475877, -0.019107641652226448, -0.04186702147126198, -0.053099144250154495, -0.020432034507393837, -0.0004185919533483684, 0.010934959165751934, 0.036054231226444244, 0.03788067027926445, 0.05227302014827728, -0.034505825489759445, -0.08298061788082123, 0.0399160161614418, 0.03668724000453949, 0.014606554992496967, -0.0071771652437746525, 0.059049926698207855, -0.06330917030572891, 0.007379058748483658, -0.0750177726149559, -0.0423760749399662, -0.019386067986488342, -0.018436923623085022, -0.015116279944777489, 0.023602722212672234, 0.0533282607793808, -0.026401247829198837, 0.023750485852360725, -0.027648568153381348, -0.016443056985735893, 0.04291580244898796, -0.04391908273100853, 0.05113501846790314, -0.03743087872862816, 0.056367188692092896, 0.048130668699741364, -0.0230261143296957, 0.03358393907546997, -0.030188169330358505, 0.08421863615512848, 0.0033821314573287964, 0.03151029348373413, -0.042818162590265274, 0.04007953777909279, -0.0050337472930550575, 0.03335743024945259, -0.026563530787825584, 0.016440672799944878, -0.04272226244211197, -0.07304228097200394, 0.024836458265781403, -0.016342775896191597, -0.055494848638772964, -0.05826134234666824, 0.027478834614157677, 0.025981346145272255, -0.04745938256382942, 0.013695796020328999, -0.027888784185051918, 0.03769542649388313, -0.024486247450113297, 0.04720773920416832, -0.012697651982307434, -0.03559652715921402, 0.012948199175298214, -0.025600459426641464, 0.014954420737922192, -0.06651762872934341, 0.04277091473340988, -0.08291683346033096, 0.016881149262189865, 0.04145864024758339, -0.04162050038576126, -0.03363965451717377, -0.05018439516425133, 0.06321889907121658, -0.00871780700981617, 0.06867428869009018, 0.057975344359874725, 0.009704249911010265, 0.049075234681367874, -0.06111253425478935, 0.027943406254053116, 0.03725599870085716, 0.032480716705322266, -0.01960119605064392, -0.0295172780752182, 0.014026675373315811, 0.056797921657562256, 
-0.031707022339105606, 0.0010152219329029322, -0.023705823346972466, -0.07695567607879639, 0.017504720017313957, -0.0020094760693609715], + "VIRT_REG_GR64_with_sub_8bit":[-0.011493992060422897, -0.027181852608919144, 0.022013556212186813, 0.05687474459409714, -0.03289574757218361, -0.04803529754281044, -0.04204253479838371, 0.044671084731817245, -0.0849028080701828, -0.09561576694250107, 0.03596775606274605, 0.027156801894307137, 0.05034027621150017, -0.006308000069111586, 0.012393618933856487, -0.048590339720249176, -0.049129705876111984, 0.059305012226104736, -0.10330235958099365, 0.00738809397444129, 0.03855152800679207, -0.03220852091908455, 0.05221837759017944, -0.01274650078266859, 0.024303985759615898, -0.05925533175468445, -0.015623844228684902, -0.025864524766802788, 0.009918035939335823, 0.004779431037604809, -0.02866589091718197, 0.006512579973787069, -0.037251196801662445, 0.005028596147894859, -0.011677909642457962, 0.051886074244976044, -0.03552602231502533, 0.011968757025897503, 0.00829426757991314, -0.06981230527162552, -0.029781555756926537, -0.012621275149285793, 0.08595969527959824, 0.08630531281232834, 0.10018875449895859, -0.054863955825567245, -0.044519901275634766, 0.0893385037779808, 0.04004377871751785, 0.003711731405928731, -0.021447300910949707, -0.08500636368989944, 0.0037281641270965338, 0.14561010897159576, 0.03993009030818939, 0.07621612399816513, 0.020513180643320084, 0.004926605150103569, -0.035578932613134384, 0.06101486086845398, -0.08422145247459412, -0.03511432558298111, 0.01537742093205452, -0.010146304965019226, -0.05133780837059021, -0.010472903028130531, -0.09726933389902115, -0.010570867918431759, -0.09348491579294205, 0.002129049738869071, -0.01265127956867218, 0.03504374623298645, -0.008679943159222603, -0.002507386729121208, -0.06586045026779175, -0.04775359109044075, -0.042809367179870605, 0.08359787613153458, -0.0230431966483593, -0.015440763905644417, 0.0195400882512331, -0.0186530202627182, -0.03176320344209671, -0.019522372633218765, -0.02984560839831829, 0.024256182834506035, -0.07656785100698471, 0.03944750130176544, 0.016559945419430733, 0.007124909665435553, 0.08061631768941879, 0.08561833202838898, -0.018525447696447372, -0.0019649232272058725, -0.018469924107193947, -0.012311050668358803, -0.08448101580142975, 0.060216110199689865, 0.06368701905012131, -0.07110093533992767], + "VIRT_REG_GR8":[0.02255251444876194, 0.012649326585233212, 0.05363747105002403, -0.006129346787929535, 0.027027001604437828, 0.03703385218977928, -0.045294541865587234, -0.02489621751010418, 0.026587747037410736, -0.06228360906243324, 0.01547946222126484, 0.03494448587298393, 0.08276952058076859, -0.03470698744058609, 0.0036826131399720907, 0.04216131567955017, -0.04518325626850128, 0.09584730118513107, -0.09126991778612137, -0.11293632537126541, 0.0141398124396801, -0.05086163431406021, 0.0421922467648983, -0.0001364851341350004, 0.05821910500526428, -0.04154132679104805, 0.036521218717098236, -0.016718950122594833, 0.0773339569568634, 0.05134757608175278, -0.03728386387228966, -0.014684299007058144, 0.016949277371168137, 0.025767508894205093, -0.01573120802640915, 0.0343811996281147, 0.008209497667849064, 0.0011038129450753331, -0.06688684970140457, -0.08167136460542679, 0.03875276446342468, 0.08301592618227005, 0.023012684658169746, 0.07135005295276642, 0.008461466059088707, 0.004998552612960339, 0.02622731775045395, -0.09479465335607529, 0.014987453818321228, -0.008574756793677807, -0.008050303906202316, -0.005560623947530985, 0.04616820812225342, 
0.11537269502878189, 0.032199542969465256, 0.05507092550396919, -0.053164780139923096, 0.012255114503204823, -0.01981479674577713, 0.06012535095214844, 0.043957680463790894, 0.02384384348988533, 0.04837791621685028, 0.04945961385965347, -0.1063770279288292, -0.07354240119457245, -0.08922741562128067, -0.026019031181931496, -0.08768662065267563, 0.09241457283496857, 0.03253300115466118, -0.018267929553985596, -0.04406850412487984, -0.05577726289629936, -0.05304105579853058, 0.016035545617341995, 0.05610279366374016, 0.06247573718428612, -0.019430609419941902, -0.017088554799556732, -0.022114543244242668, 0.07442588359117508, -0.017668865621089935, -0.02403153106570244, 0.006919574458152056, 0.05879344418644905, -0.0885634645819664, -0.016336753964424133, -0.024662213400006294, 0.029266972094774246, -0.04889025166630745, 0.042460259050130844, -0.013102580793201923, 0.023992935195565224, 0.024768078699707985, 0.047551900148391724, -0.02243787795305252, 0.05929713696241379, 0.03110451251268387, -0.00550821190699935], + "VIRT_REG_RFP80":[-0.04414765536785126, 0.05147779360413551, -0.035608600825071335, -0.03939598798751831, 0.0430026613175869, -0.03331028297543526, 0.015591064468026161, 0.01892651617527008, -0.011428372003138065, -0.06980786472558975, 0.06445881724357605, 0.1036338210105896, 0.01164929661899805, -0.07599718868732452, 0.022036561742424965, 0.10396245121955872, -0.041171155869960785, -0.07264886051416397, 0.00032837275648489594, 0.02848120965063572, -0.031889040023088455, 0.023848745971918106, -0.02298046089708805, -0.05559201166033745, 0.026687605306506157, 0.0565699003636837, -0.0134252505376935, 0.05494402348995209, -0.0584089457988739, 0.05422470346093178, -0.024360226467251778, 0.03570455685257912, 0.013681530021131039, -0.006910417694598436, 0.011886067688465118, 0.07619262486696243, 0.08147607743740082, 0.05824091285467148, 0.001224246108904481, -0.030463339760899544, -0.023527851328253746, 0.03078501485288143, -0.02225799672305584, -0.058049511164426804, 0.015403151512145996, 0.07900431007146835, 0.025944147258996964, 0.021455328911542892, 0.023985104635357857, -0.0327906534075737, 0.04195002466440201, -0.10313323140144348, -0.023333510383963585, -0.010316243395209312, -0.02042137086391449, 0.07474000751972198, 0.02313513681292534, -0.0030733307357877493, 0.06138097122311592, 0.005197131074965, -0.03222955763339996, 0.005364845506846905, -0.05313501134514809, 0.0013082564109936357, 0.025044983252882957, 0.0349799208343029, 0.09704083949327469, -0.017403649166226387, -0.03375721350312233, 0.05970870703458786, -0.021679691970348358, -0.04719642922282219, 0.024217652156949043, -0.06130526587367058, 0.004813425708562136, 0.07473690062761307, -0.039600174874067307, -0.009295261465013027, 0.05440402403473854, 0.04785943776369095, -0.04006686061620712, -0.020133933052420616, 0.00989031046628952, -0.054447200149297714, 0.06291327625513077, -0.01196430902928114, 0.0841275230050087, -0.05557875707745552, -0.0813804343342781, -0.0746457576751709, -0.024255990982055664, -0.048101916909217834, -0.014132879674434662, -0.013147399760782719, -0.009715595282614231, 0.08717820793390274, -0.04318689927458763, -0.0311901792883873, -0.017253845930099487, 0.005144816357642412], + "VIRT_REG_VR128":[0.08292517066001892, 0.053138989955186844, 0.0019234063802286983, -0.030035940930247307, 0.0821828693151474, -0.0540342852473259, 0.06449387222528458, -0.03985493257641792, 0.026820721104741096, 0.0352952741086483, -0.1056072935461998, 0.054804764688014984, 0.01685425080358982, 0.05867069214582443, 
0.11665259301662445, -0.07655566930770874, 0.021201618015766144, 0.00927705504000187, -0.04723019897937775, 0.016582123935222626, -0.01160470675677061, -0.013075411319732666, 0.01054342370480299, -0.05403316020965576, 0.033609066158533096, -0.07971179485321045, 0.1005927175283432, -0.020655132830142975, -0.0036442605778574944, 0.018269486725330353, 0.036334097385406494, -0.06517180055379868, -0.028530113399028778, -0.03768114373087883, 0.10582506656646729, 0.011199450120329857, -0.06707775592803955, 0.02332702837884426, -0.014528930187225342, -0.09369251132011414, 0.069722481071949, 0.031001657247543335, 0.08032777905464172, -0.060744334012269974, 0.015131807886064053, 0.01935953088104725, -0.087028868496418, 0.041773099452257156, 0.0381581112742424, -0.07518653571605682, 0.021307995542883873, -0.07350508868694305, -0.04699733853340149, -0.007377162110060453, 0.07836157828569412, 0.016066696494817734, -0.02160775288939476, -0.030519334599375725, -0.09255059063434601, 0.03597188740968704, -0.11260625720024109, -0.08602424710988998, 0.058293748646974564, -0.034749604761600494, 0.005541469436138868, -0.07924741506576538, -0.024103455245494843, 0.06047135218977928, 0.026729481294751167, 0.03493977710604668, -0.07453227788209915, -0.01716521382331848, 0.008985077030956745, -0.08075122535228729, 0.03353623300790787, -0.08125714957714081, 0.04245763644576073, 0.06520543247461319, 0.020550349727272987, -0.003161275526508689, -0.03491697832942009, -0.005496494937688112, 0.09021904319524765, -0.057418785989284515, 0.03494826331734657, -0.052578359842300415, -0.044952504336833954, 0.11770184338092804, -0.048565153032541275, -0.03815764561295509, 0.06020108237862587, -0.09397949278354645, 0.03820547088980675, 0.08039405196905136, 0.014751153998076916, 0.006572262849658728, 0.05658692866563797, 0.05043925344944, -0.0060436660423874855, -0.12018798291683197], + "VIRT_REG_VR256":[0.032775089144706726, 0.029240285977721214, 0.01821955479681492, 0.023595772683620453, -0.02587016113102436, -0.12190376222133636, 0.09720813482999802, 0.005780891049653292, -0.0581410676240921, 0.04817686229944229, -0.04627984017133713, 0.03618951886892319, -0.10393846780061722, 0.04380590096116066, 0.030101926997303963, -0.021811308339238167, 0.0012455569813027978, 0.06209835410118103, -0.08859474956989288, 0.0671553835272789, -0.006448917090892792, 0.0169842429459095, 0.031113164499402046, -0.07417412847280502, 0.05549546331167221, -0.013042094185948372, 0.0948401540517807, -0.07335975021123886, -0.03987044095993042, -0.005343804135918617, -0.08741248399019241, -0.08009110391139984, 0.005667346995323896, 0.03745159134268761, 0.019986214116215706, -0.03723142296075821, -0.0037649653386324644, 0.005682446528226137, 0.0659727230668068, -0.002658356446772814, 0.07049102336168289, -0.01944110542535782, -0.014278342947363853, 0.04189611226320267, 0.0312303826212883, -0.046760618686676025, 0.040438465774059296, 0.054074693471193314, 0.07479880005121231, -0.016405146569013596, 0.027125591412186623, -0.04216836765408516, 0.0011189498472958803, -0.01471384521573782, -0.010250975377857685, -0.006412460468709469, -0.12170380353927612, 0.015495882369577885, -0.054699406027793884, 0.05955614894628525, 0.06753991544246674, -0.03688138723373413, 0.049010518938302994, -0.07614680379629135, 0.06504888087511063, -0.014145595952868462, 0.02210555598139763, 0.023598313331604004, 0.00511248828843236, 0.013318972662091255, -0.11605404317378998, -0.032067783176898956, -0.05010659247636795, -0.023693162947893143, 0.06650379300117493, 
-0.026386691257357597, 0.06052805855870247, 0.0515507273375988, 0.033960308879613876, -0.06421340256929398, -0.09355985373258591, -0.0658700093626976, 0.10278744995594025, -0.10271084308624268, -0.012089421041309834, -0.04169749841094017, -0.07112454622983932, -0.032573599368333817, -0.0003141233173664659, 0.017007946968078613, 0.03622191399335861, 0.05829676240682602, 0.06261610984802246, 0.005667738616466522, 0.009631159715354443, 0.022852277383208275, 0.057013869285583496, -0.05015721917152405, 0.027599012479186058, -0.08637165278196335] + }, + "PhysicalRegisters" : { + "PHY_REG":[-0.008169060572981834, -0.017023155465722084, -0.04927198588848114, 0.0014261528849601746, 0.012259463779628277, -0.02794509381055832, -0.024857040494680405, 0.029203711077570915, 0.0433109886944294, 0.009679347276687622, -0.05811547115445137, -0.09075025469064713, -0.08525611460208893, -0.10545054078102112, 0.06474080681800842, 0.056396666914224625, 0.06781823933124542, 0.09059076011180878, -0.10420752316713333, -0.08284831047058105, 0.02349182404577732, -0.0354253351688385, -0.004627702757716179, 0.0068538435734808445, -0.053724177181720734, -0.02113335393369198, 0.05254676192998886, -0.050769440829753876, 0.061386119574308395, -0.07541731745004654, -0.024204161018133163, -0.0009893826209008694, -0.007493770215660334, -0.017051052302122116, 0.015025814063847065, -0.020427946001291275, -0.0844966471195221, 0.04589429497718811, 0.025571472942829132, -0.05280151963233948, 0.06895384937524796, 0.03960262984037399, 0.0068003153428435326, 0.09397424012422562, -0.0523529127240181, 0.03780638054013252, -0.015423302538692951, 0.029167350381612778, 0.01019437238574028, 0.023989612236618996, -0.03344425559043884, -0.07926471531391144, -0.09238854795694351, 0.04794330149888992, 0.01872367039322853, -0.029179377481341362, -0.05339968949556351, -0.04575541242957115, -0.004491546656936407, -0.009650425054132938, 0.026945313438773155, -0.02115861512720585, 0.06488905847072601, -0.06647083908319473, 0.008904196321964264, 0.010536684654653072, -0.06012551859021187, -0.00022655133216176182, -0.10175421833992004, 0.062001921236515045, -0.054452817887067795, 0.01785552129149437, -0.06749527156352997, -0.04883178323507309, -0.023449009284377098, 0.040745027363300323, 0.002448269398882985, 0.07842953503131866, -0.019806355237960815, -0.08275315910577774, 0.01131721492856741, 0.0482926219701767, 0.01892486959695816, 0.005685009527951479, -0.0055344682186841965, -0.0034555341117084026, -0.07923021167516708, 0.06387833505868912, 0.05978211387991905, -0.001252106623724103, 0.07216084003448486, -0.01223798282444477, 0.09716741740703583, 0.009659498929977417, -0.09404221922159195, -0.10122949630022049, -0.003581057768315077, 0.07885389029979706, 0.05305042862892151, -0.04988719895482063] } } \ No newline at end of file diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp index 99be1fc0dcf1f..75ca06a79eaa5 100644 --- a/llvm/lib/CodeGen/MIR2Vec.cpp +++ b/llvm/lib/CodeGen/MIR2Vec.cpp @@ -42,6 +42,13 @@ static cl::opt cl::opt OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), cl::desc("Weight for machine opcode embeddings"), cl::cat(MIR2VecCategory)); +cl::opt CommonOperandWeight( + "mir2vec-common-operand-weight", cl::Optional, cl::init(1.0), + cl::desc("Weight for common operand embeddings"), cl::cat(MIR2VecCategory)); +cl::opt + RegOperandWeight("mir2vec-reg-operand-weight", cl::Optional, cl::init(1.0), + cl::desc("Weight for register operand embeddings"), + cl::cat(MIR2VecCategory)); cl::opt 
MIR2VecEmbeddingKind( "mir2vec-kind", cl::Optional, cl::values(clEnumValN(MIR2VecKind::Symbolic, "symbolic", @@ -56,26 +63,52 @@ cl::opt MIR2VecEmbeddingKind( // Vocabulary //===----------------------------------------------------------------------===// -MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, - const TargetInstrInfo &TII) - : TII(TII) { +MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeMap, VocabMap &&CommonOperandMap, + VocabMap &&PhysicalRegisterMap, + VocabMap &&VirtualRegisterMap, + const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI) + : TII(TII), TRI(TRI), MRI(MRI) { buildCanonicalOpcodeMapping(); - unsigned CanonicalOpcodeCount = UniqueBaseOpcodeNames.size(); assert(CanonicalOpcodeCount > 0 && "No canonical opcodes found for target - invalid vocabulary"); - Layout.OperandBase = CanonicalOpcodeCount; - generateStorage(OpcodeEntries); + + buildRegisterOperandMapping(); + + // Define layout of vocabulary sections + Layout.OpcodeBase = 0; + Layout.CommonOperandBase = CanonicalOpcodeCount; + // We expect same classes for physical and virtual registers + Layout.PhyRegBase = Layout.CommonOperandBase + std::size(CommonOperandNames); + Layout.VirtRegBase = Layout.PhyRegBase + RegisterOperandNames.size(); + + generateStorage(OpcodeMap, CommonOperandMap, PhysicalRegisterMap, + VirtualRegisterMap); Layout.TotalEntries = Storage.size(); } -Expected MIRVocabulary::create(VocabMap &&Entries, - const TargetInstrInfo &TII) { - if (Entries.empty()) +Expected +MIRVocabulary::create(VocabMap &&OpcodeMap, VocabMap &&CommonOperandMap, + VocabMap &&PhyRegMap, VocabMap &&VirtRegMap, + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + if (OpcodeMap.empty() || CommonOperandMap.empty() || PhyRegMap.empty() || + VirtRegMap.empty()) return createStringError(errc::invalid_argument, "Empty vocabulary entries provided"); - return MIRVocabulary(std::move(Entries), TII); + MIRVocabulary Vocab(std::move(OpcodeMap), std::move(CommonOperandMap), + std::move(PhyRegMap), std::move(VirtRegMap), TII, TRI, + MRI); + + // Validate Storage after construction + if (!Vocab.Storage.isValid()) + return createStringError(errc::invalid_argument, + "Failed to create valid vocabulary storage"); + Vocab.ZeroEmbedding = Embedding(Vocab.Storage.getDimension(), 0.0); + return std::move(Vocab); } std::string MIRVocabulary::extractBaseOpcodeName(StringRef InstrName) { @@ -122,22 +155,74 @@ unsigned MIRVocabulary::getCanonicalOpcodeIndex(unsigned Opcode) const { return getCanonicalIndexForBaseName(BaseOpcode); } +unsigned +MIRVocabulary::getCanonicalIndexForOperandName(StringRef OperandName) const { + auto It = std::find(std::begin(CommonOperandNames), + std::end(CommonOperandNames), OperandName); + assert(It != std::end(CommonOperandNames) && + "Operand name not found in common operands"); + return Layout.CommonOperandBase + + std::distance(std::begin(CommonOperandNames), It); +} + +unsigned +MIRVocabulary::getCanonicalIndexForRegisterClass(StringRef RegName, + bool IsPhysical) const { + auto It = std::find(RegisterOperandNames.begin(), RegisterOperandNames.end(), + RegName); + assert(It != RegisterOperandNames.end() && + "Register name not found in register operands"); + unsigned LocalIndex = std::distance(RegisterOperandNames.begin(), It); + return (IsPhysical ? 
Layout.PhyRegBase : Layout.VirtRegBase) + LocalIndex; +} + std::string MIRVocabulary::getStringKey(unsigned Pos) const { assert(Pos < Layout.TotalEntries && "Position out of bounds in vocabulary"); - // For now, all entries are opcodes since we only have one section - if (Pos < Layout.OperandBase && Pos < UniqueBaseOpcodeNames.size()) { + // Handle opcodes section + if (Pos < Layout.CommonOperandBase) { // Convert canonical index back to base opcode name auto It = UniqueBaseOpcodeNames.begin(); std::advance(It, Pos); + assert(It != UniqueBaseOpcodeNames.end() && + "Canonical index out of bounds in opcode section"); return *It; } - llvm_unreachable("Invalid position in vocabulary"); - return ""; + auto getLocalIndex = [](unsigned Pos, size_t BaseOffset, size_t Bound, + const char *Msg) { + unsigned LocalIndex = Pos - BaseOffset; + assert(LocalIndex < Bound && Msg); + return LocalIndex; + }; + + // Handle common operands section + if (Pos < Layout.PhyRegBase) { + unsigned LocalIndex = getLocalIndex( + Pos, Layout.CommonOperandBase, std::size(CommonOperandNames), + "Local index out of bounds in common operands"); + return CommonOperandNames[LocalIndex].str(); + } + + // Handle physical registers section + if (Pos < Layout.VirtRegBase) { + unsigned LocalIndex = + getLocalIndex(Pos, Layout.PhyRegBase, RegisterOperandNames.size(), + "Local index out of bounds in physical registers"); + return "PhyReg_" + RegisterOperandNames[LocalIndex]; + } + + // Handle virtual registers section + unsigned LocalIndex = + getLocalIndex(Pos, Layout.VirtRegBase, RegisterOperandNames.size(), + "Local index out of bounds in virtual registers"); + return "VirtReg_" + RegisterOperandNames[LocalIndex]; } -void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) { +void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap, + const VocabMap &CommonOperandsMap, + const VocabMap &PhyRegMap, + const VocabMap &VirtRegMap) { // Helper for handling missing entities in the vocabulary. // Currently, we use a zero vector. 
In the future, we will throw an error to @@ -151,14 +236,14 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) { // Initialize opcode embeddings section unsigned EmbeddingDim = OpcodeMap.begin()->second.size(); - std::vector OpcodeEmbeddings(Layout.OperandBase, + std::vector OpcodeEmbeddings(Layout.CommonOperandBase, Embedding(EmbeddingDim)); // Populate opcode embeddings using canonical mapping for (auto COpcodeName : UniqueBaseOpcodeNames) { if (auto It = OpcodeMap.find(COpcodeName); It != OpcodeMap.end()) { auto COpcodeIndex = getCanonicalIndexForBaseName(COpcodeName); - assert(COpcodeIndex < Layout.OperandBase && + assert(COpcodeIndex < Layout.CommonOperandBase && "Canonical index out of bounds"); OpcodeEmbeddings[COpcodeIndex] = It->second; } else { @@ -166,8 +251,39 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) { } } - // TODO: Add operand/argument embeddings as additional sections - // This will require extending the vocabulary format and layout + // Initialize common operand embeddings section + std::vector CommonOperandEmbeddings(std::size(CommonOperandNames), + Embedding(EmbeddingDim)); + unsigned OperandIndex = 0; + for (const auto &CommonOperandName : CommonOperandNames) { + if (auto It = CommonOperandsMap.find(CommonOperandName.str()); + It != CommonOperandsMap.end()) { + CommonOperandEmbeddings[OperandIndex] = It->second; + } else { + handleMissingEntity(CommonOperandName); + } + ++OperandIndex; + } + + // Helper lambda for creating register operand embeddings + auto createRegisterEmbeddings = [&](const VocabMap &RegMap) { + std::vector RegEmbeddings(TRI.getNumRegClasses(), + Embedding(EmbeddingDim)); + unsigned RegOperandIndex = 0; + for (const auto &RegOperandName : RegisterOperandNames) { + if (auto It = RegMap.find(RegOperandName); It != RegMap.end()) + RegEmbeddings[RegOperandIndex] = It->second; + else + handleMissingEntity(RegOperandName); + ++RegOperandIndex; + } + return RegEmbeddings; + }; + + // Initialize register operand embeddings sections + std::vector PhyRegEmbeddings = createRegisterEmbeddings(PhyRegMap); + std::vector VirtRegEmbeddings = + createRegisterEmbeddings(VirtRegMap); // Scale the vocabulary sections based on the provided weights auto scaleVocabSection = [](std::vector &Embeddings, @@ -176,9 +292,20 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) { Embedding *= Weight; }; scaleVocabSection(OpcodeEmbeddings, OpcWeight); - - std::vector> Sections(1); - Sections[0] = std::move(OpcodeEmbeddings); + scaleVocabSection(CommonOperandEmbeddings, CommonOperandWeight); + scaleVocabSection(PhyRegEmbeddings, RegOperandWeight); + scaleVocabSection(VirtRegEmbeddings, RegOperandWeight); + + std::vector> Sections( + static_cast(Section::MaxSections)); + Sections[static_cast(Section::Opcodes)] = + std::move(OpcodeEmbeddings); + Sections[static_cast(Section::CommonOperands)] = + std::move(CommonOperandEmbeddings); + Sections[static_cast(Section::PhyRegisters)] = + std::move(PhyRegEmbeddings); + Sections[static_cast(Section::VirtRegisters)] = + std::move(VirtRegEmbeddings); Storage = ir2vec::VocabStorage(std::move(Sections)); } @@ -199,26 +326,94 @@ void MIRVocabulary::buildCanonicalOpcodeMapping() { << " unique base opcodes\n"); } -Expected -MIRVocabulary::createDummyVocabForTest(const TargetInstrInfo &TII, - unsigned Dim) { +void MIRVocabulary::buildRegisterOperandMapping() { + // Check if already built + if (!RegisterOperandNames.empty()) + return; + + for (unsigned RC = 0; RC < TRI.getNumRegClasses(); ++RC) { + 
const TargetRegisterClass *RegClass = TRI.getRegClass(RC); + if (!RegClass) + continue; + + // Get the register class name + StringRef ClassName = TRI.getRegClassName(RegClass); + RegisterOperandNames.push_back(ClassName.str()); + } +} + +unsigned MIRVocabulary::getCommonOperandIndex( + MachineOperand::MachineOperandType OperandType) const { + assert(OperandType != MachineOperand::MO_Register && + "Expected non-register operand type"); + assert(OperandType > MachineOperand::MO_Register && + OperandType < MachineOperand::MO_Last && "Operand type out of bounds"); + return static_cast(OperandType) - 1; +} + +unsigned MIRVocabulary::getRegisterOperandIndex(Register Reg) const { + assert(!RegisterOperandNames.empty() && "Register operand mapping not built"); + assert(Reg.isValid() && "Invalid register; not expected here"); + assert((Reg.isPhysical() || Reg.isVirtual()) && + "Expected a physical or virtual register"); + + const TargetRegisterClass *RegClass = nullptr; + + // For physical registers, use TRI to get minimal register class as a + // physical register can belong to multiple classes. For virtual + // registers, use MRI to uniquely identify the assigned register class. + if (Reg.isPhysical()) + RegClass = TRI.getMinimalPhysRegClass(Reg); + else + RegClass = MRI.getRegClass(Reg); + + if (RegClass) + return RegClass->getID(); + // Fallback for registers without a class (shouldn't happen) + llvm_unreachable("Register operand without a valid register class"); + return 0; +} + +Expected MIRVocabulary::createDummyVocabForTest( + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, unsigned Dim) { assert(Dim > 0 && "Dimension must be greater than zero"); float DummyVal = 0.1f; - // Create dummy embeddings for all canonical opcode names - VocabMap DummyVocabMap; + VocabMap DummyOpcMap, DummyOperandMap, DummyPhyRegMap, DummyVirtRegMap; + + // Process opcodes directly without creating temporary vocabulary for (unsigned Opcode = 0; Opcode < TII.getNumOpcodes(); ++Opcode) { std::string BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode)); - if (DummyVocabMap.count(BaseOpcode) == 0) { - // Only add if not already present - DummyVocabMap[BaseOpcode] = Embedding(Dim, DummyVal); + if (DummyOpcMap.count(BaseOpcode) == 0) { // Only add if not already present + DummyOpcMap[BaseOpcode] = Embedding(Dim, DummyVal); DummyVal += 0.1f; } } - // Create and return vocabulary with dummy embeddings - return MIRVocabulary::create(std::move(DummyVocabMap), TII); + // Add common operands + for (const auto &CommonOperandName : CommonOperandNames) { + DummyOperandMap[CommonOperandName.str()] = Embedding(Dim, DummyVal); + DummyVal += 0.1f; + } + + // Process register classes directly + for (unsigned RC = 0; RC < TRI.getNumRegClasses(); ++RC) { + const TargetRegisterClass *RegClass = TRI.getRegClass(RC); + if (!RegClass) + continue; + + std::string ClassName = TRI.getRegClassName(RegClass); + DummyPhyRegMap[ClassName] = Embedding(Dim, DummyVal); + DummyVirtRegMap[ClassName] = Embedding(Dim, DummyVal); + DummyVal += 0.1f; + } + + // Create vocabulary directly without temporary instance + return MIRVocabulary::create( + std::move(DummyOpcMap), std::move(DummyOperandMap), + std::move(DummyPhyRegMap), std::move(DummyVirtRegMap), TII, TRI, MRI); } //===----------------------------------------------------------------------===// @@ -236,9 +431,10 @@ StringRef MIR2VecVocabLegacyAnalysis::getPassName() const { return "MIR2Vec Vocabulary Analysis"; } -Error 
MIR2VecVocabLegacyAnalysis::readVocabulary() { - // TODO: Extend vocabulary format to support multiple sections - // (opcodes, operands, etc.) similar to IR2Vec structure +Error MIR2VecVocabLegacyAnalysis::readVocabulary(VocabMap &OpcodeVocab, + VocabMap &CommonOperandVocab, + VocabMap &PhyRegVocabMap, + VocabMap &VirtRegVocabMap) { if (VocabFile.empty()) return createStringError( errc::invalid_argument, @@ -255,21 +451,47 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary() { if (!ParsedVocabValue) return ParsedVocabValue.takeError(); - unsigned Dim = 0; + unsigned OpcodeDim = 0, CommonOperandDim = 0, PhyRegOperandDim = 0, + VirtRegOperandDim = 0; + if (auto Err = ir2vec::VocabStorage::parseVocabSection( + "Opcodes", *ParsedVocabValue, OpcodeVocab, OpcodeDim)) + return Err; + + if (auto Err = ir2vec::VocabStorage::parseVocabSection( + "CommonOperands", *ParsedVocabValue, CommonOperandVocab, + CommonOperandDim)) + return Err; + + if (auto Err = ir2vec::VocabStorage::parseVocabSection( + "PhysicalRegisters", *ParsedVocabValue, PhyRegVocabMap, + PhyRegOperandDim)) + return Err; + if (auto Err = ir2vec::VocabStorage::parseVocabSection( - "entities", *ParsedVocabValue, StrVocabMap, Dim)) + "VirtualRegisters", *ParsedVocabValue, VirtRegVocabMap, + VirtRegOperandDim)) return Err; + // All sections must have the same embedding dimension + if (!(OpcodeDim == CommonOperandDim && CommonOperandDim == PhyRegOperandDim && + PhyRegOperandDim == VirtRegOperandDim)) { + return createStringError( + errc::illegal_byte_sequence, + "MIR2Vec vocabulary sections have different dimensions"); + } + return Error::success(); } Expected MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { - if (StrVocabMap.empty()) { - if (Error Err = readVocabulary()) { - return std::move(Err); - } - } + if (Vocab.has_value()) + return std::move(Vocab.value()); + + VocabMap OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap; + if (Error Err = + readVocabulary(OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap)) + return std::move(Err); // Get machine module info to access machine functions and target info MachineModuleInfo &MMI = getAnalysis().getMMI(); @@ -280,8 +502,24 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { continue; if (auto *MF = MMI.getMachineFunction(F)) { - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - return mir2vec::MIRVocabulary::create(std::move(StrVocabMap), *TII); + auto &Subtarget = MF->getSubtarget(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + if (!TII) { + return createStringError(errc::invalid_argument, + "No TargetInstrInfo available; cannot create " + "MIR2Vec vocabulary"); + } + + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + if (!TRI) { + return createStringError(errc::invalid_argument, + "No TargetRegisterInfo available; cannot " + "create MIR2Vec vocabulary"); + } + + return mir2vec::MIRVocabulary::create( + std::move(OpcMap), std::move(CommonOperandMap), std::move(PhyRegMap), + std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo()); } } @@ -351,9 +589,14 @@ Embedding SymbolicMIREmbedder::computeEmbeddings(const MachineInstr &MI) const { if (MI.isDebugInstr()) return Embedding(Dimension, 0); - // Todo: Add operand/argument contributions + // Opcode embedding + Embedding InstructionEmbedding = Vocab[MI.getOpcode()]; + + // Add operand contributions + for (const MachineOperand &MO : MI.operands()) + InstructionEmbedding += Vocab[MO]; - return Vocab[MI.getOpcode()]; + return InstructionEmbedding; } 
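// ---------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] The hunks above split the flat
// MIR2Vec vocabulary into four sections (Opcodes, CommonOperands,
// PhysicalRegisters, VirtualRegisters) and compute a flat index as a
// per-section base plus a local index. The snippet below is a minimal,
// self-contained illustration of that layout arithmetic using only standard
// library types; the names (SectionLayout, flatRegClassIndex) are
// hypothetical stand-ins for the real MIRVocabulary/VocabStorage machinery.
#include <cassert>
#include <cstddef>

struct SectionLayout {
  std::size_t OpcodeBase = 0;        // opcodes always start at index 0
  std::size_t CommonOperandBase = 0; // == number of canonical opcodes
  std::size_t PhyRegBase = 0;        // == CommonOperandBase + #common operand kinds
  std::size_t VirtRegBase = 0;       // == PhyRegBase + #register classes
  std::size_t TotalEntries = 0;      // == VirtRegBase + #register classes
};

// Physical and virtual registers share the same per-class local index but
// live in different sections, mirroring getCanonicalIndexForRegisterClass()
// in the hunks above.
inline std::size_t flatRegClassIndex(const SectionLayout &L,
                                     std::size_t ClassId, bool IsPhysical) {
  std::size_t Base = IsPhysical ? L.PhyRegBase : L.VirtRegBase;
  assert(Base + ClassId < L.TotalEntries && "flat index out of bounds");
  return Base + ClassId;
}
// Example (hypothetical numbers): with 4 register classes and
// VirtRegBase == 14, flatRegClassIndex(L, 2, /*IsPhysical=*/false) == 16.
// ---------------------------------------------------------------------------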
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json index 2894fff850ba7..da0d13d64e165 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json @@ -1,5 +1,5 @@ { - "entities" : { + "Opcodes" : { "ABS_Fp":[1, 2], "ADC":[3, 4], "ADD":[5, 6], @@ -7,5 +7,21 @@ "ADDPDrr":[9, 10], "ADDPSrr":[11, 12], "ADDSDrm":[13, 14] + }, + "CommonOperands": { + "Immediate": [0.1, 0.1], + "MBB": [0.2, 0.2], + "FrameIndex": [0.3, 0.3], + "GlobalAddress": [0.4, 0.4] + }, + "PhysicalRegisters": { + "GR32": [0.5, 0.5], + "GR64": [0.6, 0.6], + "XMM": [0.7, 0.7] + }, + "VirtualRegisters": { + "GR32": [0.8, 0.8], + "GR64": [0.9, 0.9], + "XMM": [1.0, 1.0] } } \ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json index 5de715bf80917..f4b14a4242c64 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json @@ -1,5 +1,5 @@ { - "entities": { + "Opcodes": { "KILL": [0.1, 0.2, 0.3], "MOV": [0.4, 0.5, 0.6], "LEA": [0.7, 0.8, 0.9], @@ -18,5 +18,21 @@ "POP": [4.6, 4.7, 4.8], "NOP": [4.9, 5.0, 5.1], "COPY": [5.2, 5.3, 5.4] + }, + "CommonOperands": { + "Immediate": [0.1, 0.1, 0.1], + "MBB": [0.2, 0.2, 0.2], + "FrameIndex": [0.3, 0.3, 0.3], + "GlobalAddress": [0.4, 0.4, 0.4] + }, + "PhysicalRegisters": { + "GR32": [0.5, 0.5, 0.5], + "GR64": [0.6, 0.6, 0.6], + "XMM": [0.7, 0.7, 0.7] + }, + "VirtualRegisters": { + "GR32": [0.8, 0.8, 0.8], + "GR64": [0.9, 0.9, 0.9], + "XMM": [1.0, 1.0, 1.0] } } \ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json index bf041635fc603..6274fb747179c 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json @@ -1,7 +1,16 @@ { - "entities": { + "Opcodes": { "ADD": [1.0, 2.0, 3.0], "SUB": [1.5], "MUL": [2.0, 3.0] + }, + "CommonOperands": { + "Immediate": [1.0] + }, + "PhysicalRegisters": { + "GR32": [1.0, 2.0] + }, + "VirtualRegisters": { + "GR32": [1.0, 2.0, 3.0] } } diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json index 63e8ccbd41034..7bfdf3bab831d 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json @@ -1,5 +1,5 @@ { - "entities": { + "Opcodes": { "ADD": [], "SUB": [], "MUL": [], @@ -8,5 +8,14 @@ "JMP": [], "CALL": [], "RET": [] + }, + "CommonOperands": { + "Immediate": [] + }, + "PhysicalRegisters": { + "GR32": [] + }, + "VirtualRegisters": { + "GR32": [] } } \ No newline at end of file diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index 6327cffb77e74..d3c0da9862245 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ] Key: XSTORE: [ 0.00 0.00 ] Key: XSUSLDTRK: [ 0.00 0.00 ] Key: XTEST: [ 0.00 0.00 ] +Key: Immediate: [ 0.10 0.10 ] +Key: CImmediate: [ 0.00 0.00 ] +Key: 
FPImmediate: [ 0.00 0.00 ] +Key: MBB: [ 0.20 0.20 ] +Key: FrameIndex: [ 0.30 0.30 ] +Key: ConstantPoolIndex: [ 0.00 0.00 ] +Key: TargetIndex: [ 0.00 0.00 ] +Key: JumpTableIndex: [ 0.00 0.00 ] +Key: ExternalSymbol: [ 0.00 0.00 ] +Key: GlobalAddress: [ 0.40 0.40 ] +Key: BlockAddress: [ 0.00 0.00 ] +Key: RegisterMask: [ 0.00 0.00 ] +Key: RegisterLiveOut: [ 0.00 0.00 ] +Key: Metadata: [ 0.00 0.00 ] +Key: MCSymbol: [ 0.00 0.00 ] +Key: CFIIndex: [ 0.00 0.00 ] +Key: IntrinsicID: [ 0.00 0.00 ] +Key: Predicate: [ 0.00 0.00 ] +Key: ShuffleMask: [ 0.00 0.00 ] +Key: PhyReg_GR8: [ 0.00 0.00 ] +Key: PhyReg_GRH8: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: PhyReg_GRH16: [ 0.00 0.00 ] +Key: PhyReg_GR16: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK1: [ 0.00 0.00 ] +Key: PhyReg_VK16: [ 0.00 0.00 ] +Key: PhyReg_VK2: [ 0.00 0.00 ] +Key: PhyReg_VK4: [ 0.00 0.00 ] +Key: PhyReg_VK8: [ 0.00 0.00 ] +Key: PhyReg_VK16WM: [ 0.00 0.00 ] +Key: PhyReg_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_VK2WM: [ 0.00 0.00 ] +Key: PhyReg_VK4WM: [ 0.00 0.00 ] +Key: PhyReg_VK8WM: [ 0.00 0.00 ] +Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ] +Key: PhyReg_FPCCR: [ 0.00 0.00 ] +Key: PhyReg_FR16X: [ 0.00 0.00 ] +Key: PhyReg_FR16: [ 0.00 0.00 ] +Key: PhyReg_VK16PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK2PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK4PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK8PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_FR32X: [ 0.00 0.00 ] +Key: PhyReg_GR32: [ 0.50 0.50 ] +Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ] +Key: PhyReg_FR32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_RFP32: [ 0.00 0.00 ] +Key: PhyReg_VK32WM: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_DC: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_CCR: [ 0.00 0.00 ] +Key: PhyReg_DFCCR: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_RFP64: [ 0.00 0.00 ] +Key: PhyReg_GR64: [ 0.60 
0.60 ] +Key: PhyReg_FR64X: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ] +Key: PhyReg_FR64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK64: [ 0.00 0.00 ] +Key: PhyReg_VR64: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_VK64WM: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_AD: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_A: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_RST: [ 0.00 0.00 ] +Key: PhyReg_RFP80: [ 0.00 0.00 ] +Key: PhyReg_RFP80_7: [ 0.00 0.00 ] +Key: PhyReg_VR128X: [ 0.00 0.00 ] +Key: PhyReg_VR128: [ 0.00 0.00 ] +Key: PhyReg_VR256X: [ 0.00 0.00 ] +Key: PhyReg_VR256: [ 0.00 0.00 ] +Key: PhyReg_VR512: [ 0.00 0.00 ] +Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] +Key: PhyReg_TILE: [ 0.00 0.00 ] +Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] +Key: VirtReg_GR8: [ 0.00 0.00 ] +Key: VirtReg_GRH8: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: VirtReg_GRH16: [ 0.00 0.00 ] +Key: VirtReg_GR16: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX: [ 0.00 
0.00 ] +Key: VirtReg_VK1: [ 0.00 0.00 ] +Key: VirtReg_VK16: [ 0.00 0.00 ] +Key: VirtReg_VK2: [ 0.00 0.00 ] +Key: VirtReg_VK4: [ 0.00 0.00 ] +Key: VirtReg_VK8: [ 0.00 0.00 ] +Key: VirtReg_VK16WM: [ 0.00 0.00 ] +Key: VirtReg_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_VK2WM: [ 0.00 0.00 ] +Key: VirtReg_VK4WM: [ 0.00 0.00 ] +Key: VirtReg_VK8WM: [ 0.00 0.00 ] +Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ] +Key: VirtReg_FPCCR: [ 0.00 0.00 ] +Key: VirtReg_FR16X: [ 0.00 0.00 ] +Key: VirtReg_FR16: [ 0.00 0.00 ] +Key: VirtReg_VK16PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK2PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK4PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK8PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_FR32X: [ 0.00 0.00 ] +Key: VirtReg_GR32: [ 0.80 0.80 ] +Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ] +Key: VirtReg_FR32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_RFP32: [ 0.00 0.00 ] +Key: VirtReg_VK32WM: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_DC: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_CCR: [ 0.00 0.00 ] +Key: VirtReg_DFCCR: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_RFP64: [ 0.00 0.00 ] +Key: VirtReg_GR64: [ 0.90 0.90 ] +Key: VirtReg_FR64X: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ] +Key: VirtReg_FR64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK64: [ 0.00 0.00 ] +Key: VirtReg_VR64: [ 0.00 
0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_VK64WM: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_AD: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_A: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_RST: [ 0.00 0.00 ] +Key: VirtReg_RFP80: [ 0.00 0.00 ] +Key: VirtReg_RFP80_7: [ 0.00 0.00 ] +Key: VirtReg_VR128X: [ 0.00 0.00 ] +Key: VirtReg_VR128: [ 0.00 0.00 ] +Key: VirtReg_VR256X: [ 0.00 0.00 ] +Key: VirtReg_VR256: [ 0.00 0.00 ] +Key: VirtReg_VR512: [ 0.00 0.00 ] +Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] +Key: VirtReg_TILE: [ 0.00 0.00 ] +Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index 4409e6dbd5c72..c6e5508248b9b 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ] Key: XSTORE: [ 0.00 0.00 ] Key: XSUSLDTRK: [ 0.00 0.00 ] Key: XTEST: [ 0.00 0.00 ] +Key: Immediate: [ 0.10 0.10 ] +Key: CImmediate: [ 0.00 0.00 ] +Key: FPImmediate: [ 0.00 0.00 ] +Key: MBB: [ 0.20 0.20 ] +Key: FrameIndex: [ 0.30 0.30 ] +Key: ConstantPoolIndex: [ 0.00 0.00 ] +Key: TargetIndex: [ 0.00 0.00 ] +Key: JumpTableIndex: [ 0.00 0.00 ] +Key: ExternalSymbol: [ 0.00 0.00 ] +Key: GlobalAddress: [ 0.40 0.40 ] +Key: BlockAddress: [ 0.00 0.00 ] +Key: RegisterMask: [ 0.00 0.00 ] +Key: RegisterLiveOut: [ 0.00 0.00 ] +Key: Metadata: [ 0.00 0.00 ] +Key: MCSymbol: [ 0.00 0.00 ] +Key: CFIIndex: [ 0.00 0.00 ] +Key: IntrinsicID: [ 0.00 0.00 ] +Key: Predicate: [ 0.00 0.00 ] +Key: ShuffleMask: [ 0.00 0.00 ] +Key: PhyReg_GR8: [ 0.00 0.00 ] +Key: PhyReg_GRH8: [ 
0.00 0.00 ] +Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: PhyReg_GRH16: [ 0.00 0.00 ] +Key: PhyReg_GR16: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK1: [ 0.00 0.00 ] +Key: PhyReg_VK16: [ 0.00 0.00 ] +Key: PhyReg_VK2: [ 0.00 0.00 ] +Key: PhyReg_VK4: [ 0.00 0.00 ] +Key: PhyReg_VK8: [ 0.00 0.00 ] +Key: PhyReg_VK16WM: [ 0.00 0.00 ] +Key: PhyReg_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_VK2WM: [ 0.00 0.00 ] +Key: PhyReg_VK4WM: [ 0.00 0.00 ] +Key: PhyReg_VK8WM: [ 0.00 0.00 ] +Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ] +Key: PhyReg_FPCCR: [ 0.00 0.00 ] +Key: PhyReg_FR16X: [ 0.00 0.00 ] +Key: PhyReg_FR16: [ 0.00 0.00 ] +Key: PhyReg_VK16PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK2PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK4PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK8PAIR: [ 0.00 0.00 ] +Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_FR32X: [ 0.00 0.00 ] +Key: PhyReg_GR32: [ 0.50 0.50 ] +Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ] +Key: PhyReg_FR32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK32: [ 0.00 0.00 ] +Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_RFP32: [ 0.00 0.00 ] +Key: PhyReg_VK32WM: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_DC: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_CCR: [ 0.00 0.00 ] +Key: PhyReg_DFCCR: [ 0.00 0.00 ] +Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: PhyReg_RFP64: [ 0.00 0.00 ] +Key: PhyReg_GR64: [ 0.60 0.60 ] +Key: PhyReg_FR64X: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ] +Key: PhyReg_FR64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] 
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_VK64: [ 0.00 0.00 ] +Key: PhyReg_VR64: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: PhyReg_VK64WM: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_AD: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_A: [ 0.00 0.00 ] +Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: PhyReg_RST: [ 0.00 0.00 ] +Key: PhyReg_RFP80: [ 0.00 0.00 ] +Key: PhyReg_RFP80_7: [ 0.00 0.00 ] +Key: PhyReg_VR128X: [ 0.00 0.00 ] +Key: PhyReg_VR128: [ 0.00 0.00 ] +Key: PhyReg_VR256X: [ 0.00 0.00 ] +Key: PhyReg_VR256: [ 0.00 0.00 ] +Key: PhyReg_VR512: [ 0.00 0.00 ] +Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] +Key: PhyReg_TILE: [ 0.00 0.00 ] +Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] +Key: VirtReg_GR8: [ 0.00 0.00 ] +Key: VirtReg_GRH8: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ] +Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ] +Key: VirtReg_GRH16: [ 0.00 0.00 ] +Key: VirtReg_GR16: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK1: [ 0.00 0.00 ] +Key: VirtReg_VK16: [ 0.00 0.00 ] +Key: VirtReg_VK2: [ 0.00 0.00 ] +Key: VirtReg_VK4: [ 0.00 0.00 ] +Key: VirtReg_VK8: [ 0.00 0.00 ] +Key: VirtReg_VK16WM: [ 0.00 0.00 ] +Key: VirtReg_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_VK2WM: [ 0.00 0.00 ] +Key: VirtReg_VK4WM: [ 0.00 0.00 ] +Key: VirtReg_VK8WM: [ 0.00 0.00 ] +Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ] +Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ] +Key: VirtReg_FPCCR: [ 0.00 0.00 ] +Key: VirtReg_FR16X: [ 0.00 0.00 ] +Key: VirtReg_FR16: [ 0.00 0.00 ] +Key: VirtReg_VK16PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR: [ 0.00 0.00 ] 
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK4PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK8PAIR: [ 0.00 0.00 ] +Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_FR32X: [ 0.00 0.00 ] +Key: VirtReg_GR32: [ 0.80 0.80 ] +Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ] +Key: VirtReg_FR32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK32: [ 0.00 0.00 ] +Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_RFP32: [ 0.00 0.00 ] +Key: VirtReg_VK32WM: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_DC: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_CCR: [ 0.00 0.00 ] +Key: VirtReg_DFCCR: [ 0.00 0.00 ] +Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ] +Key: VirtReg_RFP64: [ 0.00 0.00 ] +Key: VirtReg_GR64: [ 0.90 0.90 ] +Key: VirtReg_FR64X: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ] +Key: VirtReg_FR64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_VK64: [ 0.00 0.00 ] +Key: VirtReg_VR64: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ] +Key: VirtReg_VK64WM: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ] +Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ] +Key: 
VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ] +Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_AD: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_A: [ 0.00 0.00 ] +Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ] +Key: VirtReg_RST: [ 0.00 0.00 ] +Key: VirtReg_RFP80: [ 0.00 0.00 ] +Key: VirtReg_RFP80_7: [ 0.00 0.00 ] +Key: VirtReg_VR128X: [ 0.00 0.00 ] +Key: VirtReg_VR128: [ 0.00 0.00 ] +Key: VirtReg_VR256X: [ 0.00 0.00 ] +Key: VirtReg_VR256: [ 0.00 0.00 ] +Key: VirtReg_VR512: [ 0.00 0.00 ] +Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] +Key: VirtReg_TILE: [ 0.00 0.00 ] +Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/if-else.mir b/llvm/test/CodeGen/MIR2Vec/if-else.mir index 5734a23c9bcb6..f2572f55751e5 100644 --- a/llvm/test/CodeGen/MIR2Vec/if-else.mir +++ b/llvm/test/CodeGen/MIR2Vec/if-else.mir @@ -135,10 +135,10 @@ body: | # CHECK: Machine basic block vectors: # CHECK-NEXT: Machine basic block: abc:entry: -# CHECK-NEXT: [ 16.50 17.10 17.70 ] +# CHECK-NEXT: [ 23.60 24.20 24.80 ] # CHECK-NEXT: Machine basic block: abc:if.then: -# CHECK-NEXT: [ 4.50 4.80 5.10 ] +# CHECK-NEXT: [ 7.30 7.60 7.90 ] # CHECK-NEXT: Machine basic block: abc:if.else: -# CHECK-NEXT: [ 0.80 1.00 1.20 ] +# CHECK-NEXT: [ 3.40 3.60 3.80 ] # CHECK-NEXT: Machine basic block: abc:return: -# CHECK-NEXT: [ 6.60 6.90 7.20 ] \ No newline at end of file +# CHECK-NEXT: [ 8.80 9.10 9.40 ] diff --git a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir index 338cb637bac80..0fdcc812c6b28 100644 --- a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir +++ b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir @@ -48,29 +48,29 @@ body: | RET 0 # CHECK: MIR2Vec embeddings for machine function add_function: -# CHECK: Function vector: [ 19.20 19.80 20.40 ] +# CHECK: Function vector: [ 26.50 27.10 27.70 ] # CHECK-NEXT: Machine basic block vectors: # CHECK-NEXT: Machine basic block: add_function:entry: -# CHECK-NEXT: [ 19.20 19.80 20.40 ] +# CHECK-NEXT: [ 26.50 27.10 27.70 ] # CHECK-NEXT: Machine instruction vectors: # CHECK-NEXT: Machine instruction: %1:gr32 = COPY $esi -# CHECK-NEXT: [ 5.20 5.30 5.40 ] +# CHECK-NEXT: [ 6.00 6.10 6.20 ] # CHECK-NEXT: Machine instruction: %0:gr32 = COPY $edi -# CHECK-NEXT: [ 5.20 5.30 5.40 ] +# CHECK-NEXT: [ 6.00 6.10 6.20 ] # CHECK-NEXT: Machine instruction: 
%2:gr32 = nsw ADD32rr %0:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags -# CHECK-NEXT: [ 1.30 1.40 1.50 ] +# CHECK-NEXT: [ 3.70 3.80 3.90 ] # CHECK-NEXT: Machine instruction: %3:gr32 = ADD32rr %2:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags -# CHECK-NEXT: [ 1.30 1.40 1.50 ] +# CHECK-NEXT: [ 3.70 3.80 3.90 ] # CHECK-NEXT: Machine instruction: $eax = COPY %3:gr32 -# CHECK-NEXT: [ 5.20 5.30 5.40 ] +# CHECK-NEXT: [ 6.00 6.10 6.20 ] # CHECK-NEXT: Machine instruction: RET 0, $eax -# CHECK-NEXT: [ 1.00 1.10 1.20 ] +# CHECK-NEXT: [ 1.10 1.20 1.30 ] # CHECK: MIR2Vec embeddings for machine function simple_function: -# CHECK-NEXT:Function vector: [ 1.00 1.10 1.20 ] +# CHECK-NEXT:Function vector: [ 1.10 1.20 1.30 ] # CHECK-NEXT: Machine basic block vectors: # CHECK-NEXT: Machine basic block: simple_function:entry: -# CHECK-NEXT: [ 1.00 1.10 1.20 ] +# CHECK-NEXT: [ 1.10 1.20 1.30 ] # CHECK-NEXT: Machine instruction vectors: # CHECK-NEXT: Machine instruction: RET 0 -# CHECK-NEXT: [ 1.00 1.10 1.20 ] \ No newline at end of file +# CHECK-NEXT: [ 1.10 1.20 1.30 ] diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll index c6554bc0b7eca..13e908eb7ba11 100644 --- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll +++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll @@ -10,6 +10,6 @@ define dso_local void @test() { } ; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path -; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero -; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file -; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension +; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'Opcodes' section of the vocabulary is zero +; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'Opcodes' section in vocabulary file +; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'Opcodes' section of the vocabulary are not of the same dimension diff --git a/llvm/unittests/CodeGen/MIR2VecTest.cpp b/llvm/unittests/CodeGen/MIR2VecTest.cpp index 8710d6b45969a..d42749c553eea 100644 --- a/llvm/unittests/CodeGen/MIR2VecTest.cpp +++ b/llvm/unittests/CodeGen/MIR2VecTest.cpp @@ -54,6 +54,9 @@ class MIR2VecVocabTestFixture : public ::testing::Test { std::unique_ptr M; std::unique_ptr TM; const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + std::unique_ptr MMI; + MachineFunction *MF = nullptr; static void SetUpTestCase() { InitializeAllTargets(); @@ -90,15 +93,24 @@ class MIR2VecVocabTestFixture : public ::testing::Test { Function *F = Function::Create(FT, Function::ExternalLinkage, "test", M.get()); - // Get the target instruction info + // Create MMI and MF to get TRI and MRI + MMI = std::make_unique(TM.get()); + MF = &MMI->getOrCreateMachineFunction(*F); + + // Get the target instruction info and register info TII = TM->getSubtargetImpl(*F)->getInstrInfo(); - if (!TII) { - GTEST_SKIP() << "Failed to get target instruction info; Skipping test"; + TRI = TM->getSubtargetImpl(*F)->getRegisterInfo(); + if (!TII || !TRI) { + GTEST_SKIP() + << "Failed to get target 
instruction/register info; Skipping test"; return; } } - void TearDown() override { TII = nullptr; } + void TearDown() override { + TII = nullptr; + TRI = nullptr; + } // Find an opcode by name int findOpcodeByName(StringRef Name) { @@ -110,17 +122,94 @@ class MIR2VecVocabTestFixture : public ::testing::Test { } // Create a vocabulary with specific opcodes and embeddings - Expected - createTestVocab(std::initializer_list> opcodes, - unsigned dimension = 2) { - assert(TII && "TargetInstrInfo not initialized"); - VocabMap VMap; - for (const auto &[name, value] : opcodes) - VMap[name] = Embedding(dimension, value); - return MIRVocabulary::create(std::move(VMap), *TII); + // This might cause errors in future when the validation in + // MIRVocabulary::generateStorage() enforces hard checks on the vocabulary + // entries. + Expected createTestVocab( + std::initializer_list> Opcodes, + std::initializer_list> CommonOperands, + std::initializer_list> PhyRegs, + std::initializer_list> VirtRegs, + unsigned Dimension = 2) { + assert(TII && TRI && MF && "Target info not initialized"); + VocabMap OpcodeMap, CommonOperandMap, PhyRegMap, VirtRegMap; + for (const auto &[Name, Value] : Opcodes) + OpcodeMap[Name] = Embedding(Dimension, Value); + + for (const auto &[Name, Value] : CommonOperands) + CommonOperandMap[Name] = Embedding(Dimension, Value); + + for (const auto &[Name, Value] : PhyRegs) + PhyRegMap[Name] = Embedding(Dimension, Value); + + for (const auto &[Name, Value] : VirtRegs) + VirtRegMap[Name] = Embedding(Dimension, Value); + + // If any section is empty, create minimal maps for other vocabulary + // sections to satisfy validation + if (Opcodes.size() == 0) + OpcodeMap["NOOP"] = Embedding(Dimension, 0.0f); + if (CommonOperands.size() == 0) + CommonOperandMap["Immediate"] = Embedding(Dimension, 0.0f); + if (PhyRegs.size() == 0) + PhyRegMap["GR32"] = Embedding(Dimension, 0.0f); + if (VirtRegs.size() == 0) + VirtRegMap["GR32"] = Embedding(Dimension, 0.0f); + + return MIRVocabulary::create( + std::move(OpcodeMap), std::move(CommonOperandMap), std::move(PhyRegMap), + std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo()); } }; +// Parameterized test for empty vocab sections +class MIR2VecVocabEmptySectionTestFixture + : public MIR2VecVocabTestFixture, + public ::testing::WithParamInterface { +protected: + void SetUp() override { + MIR2VecVocabTestFixture::SetUp(); + // If base class setup was skipped (TII not initialized), skip derived setup + if (!TII) + GTEST_SKIP() << "Failed to get target instruction info in " + "the base class setup; Skipping test"; + } +}; + +TEST_P(MIR2VecVocabEmptySectionTestFixture, EmptySectionFailsValidation) { + int EmptySection = GetParam(); + VocabMap OpcodeMap, CommonOperandMap, PhyRegMap, VirtRegMap; + + if (EmptySection != 0) + OpcodeMap["ADD"] = Embedding(2, 1.0f); + if (EmptySection != 1) + CommonOperandMap["Immediate"] = Embedding(2, 0.0f); + if (EmptySection != 2) + PhyRegMap["GR32"] = Embedding(2, 0.0f); + if (EmptySection != 3) + VirtRegMap["GR32"] = Embedding(2, 0.0f); + + ASSERT_TRUE(TII != nullptr); + ASSERT_TRUE(TRI != nullptr); + ASSERT_TRUE(MF != nullptr); + + auto VocabOrErr = MIRVocabulary::create( + std::move(OpcodeMap), std::move(CommonOperandMap), std::move(PhyRegMap), + std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo()); + EXPECT_FALSE(static_cast(VocabOrErr)) + << "Factory method should fail when section " << EmptySection + << " is empty"; + + if (!VocabOrErr) { + auto Err = VocabOrErr.takeError(); + std::string ErrorMsg = 
toString(std::move(Err)); + EXPECT_FALSE(ErrorMsg.empty()); + } +} + +INSTANTIATE_TEST_SUITE_P(EmptySection, MIR2VecVocabEmptySectionTestFixture, + ::testing::Values(0, 1, 2, 3)); + TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) { // Test that same base opcodes get same canonical indices std::string BaseName1 = MIRVocabulary::extractBaseOpcodeName("ADD16ri"); @@ -133,7 +222,7 @@ TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) { // Create a MIRVocabulary instance to test the mapping // Use a minimal MIRVocabulary to trigger canonical mapping construction Embedding Val = Embedding(64, 1.0f); - auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, 64); + auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {}, 64); ASSERT_TRUE(static_cast(TestVocabOrErr)) << "Failed to create vocabulary: " << toString(TestVocabOrErr.takeError()); @@ -190,7 +279,7 @@ TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) { // Create a MIRVocabulary instance to test deterministic mapping // Use a minimal MIRVocabulary to trigger canonical mapping construction - auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, 64); + auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {}, 64); ASSERT_TRUE(static_cast(TestVocabOrErr)) << "Failed to create vocabulary: " << toString(TestVocabOrErr.takeError()); @@ -210,7 +299,8 @@ TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) { // Test MIRVocabulary construction TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) { - auto VocabOrErr = createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, 128); + auto VocabOrErr = + createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, {}, {}, {}, 128); ASSERT_TRUE(static_cast(VocabOrErr)) << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); auto &Vocab = *VocabOrErr; @@ -231,42 +321,15 @@ TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) { EXPECT_GT(Count, 0u); } -// Test factory method with empty vocabulary -TEST_F(MIR2VecVocabTestFixture, EmptyVocabularyCreation) { - VocabMap EmptyVMap; - - auto VocabOrErr = MIRVocabulary::create(std::move(EmptyVMap), *TII); - EXPECT_FALSE(static_cast(VocabOrErr)) - << "Factory method should fail with empty vocabulary"; - - // Consume the error - if (!VocabOrErr) { - auto Err = VocabOrErr.takeError(); - std::string ErrorMsg = toString(std::move(Err)); - EXPECT_FALSE(ErrorMsg.empty()); - } -} - // Fixture for embedding related tests class MIR2VecEmbeddingTestFixture : public MIR2VecVocabTestFixture { protected: - std::unique_ptr MMI; - MachineFunction *MF = nullptr; - void SetUp() override { MIR2VecVocabTestFixture::SetUp(); // If base class setup was skipped (TII not initialized), skip derived setup if (!TII) GTEST_SKIP() << "Failed to get target instruction info in " "the base class setup; Skipping test"; - - // Create a dummy function for MachineFunction - FunctionType *FT = FunctionType::get(Type::getVoidTy(*Ctx), false); - Function *F = - Function::Create(FT, Function::ExternalLinkage, "test", M.get()); - - MMI = std::make_unique(TM.get()); - MF = &MMI->getOrCreateMachineFunction(*F); } void TearDown() override { MIR2VecVocabTestFixture::TearDown(); } @@ -298,7 +361,8 @@ class MIR2VecEmbeddingTestFixture : public MIR2VecVocabTestFixture { // Test factory method for creating embedder TEST_F(MIR2VecEmbeddingTestFixture, CreateSymbolicEmbedder) { - auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 1); + auto VocabOrErr = + MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 1); 
ASSERT_TRUE(static_cast(VocabOrErr)) << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); auto &V = *VocabOrErr; @@ -307,7 +371,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, CreateSymbolicEmbedder) { } TEST_F(MIR2VecEmbeddingTestFixture, CreateInvalidMode) { - auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 1); + auto VocabOrErr = + MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 1); ASSERT_TRUE(static_cast(VocabOrErr)) << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); auto &V = *VocabOrErr; @@ -324,7 +389,7 @@ TEST_F(MIR2VecEmbeddingTestFixture, TestSymbolicEmbedder) { {"RET", 2.0f}, // [2.0, 2.0, 2.0, 2.0] {"TRAP", 3.0f} // [3.0, 3.0, 3.0, 3.0] }, - 4); + {}, {}, {}, 4); ASSERT_TRUE(static_cast(VocabOrErr)) << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); auto &Vocab = *VocabOrErr; @@ -378,7 +443,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, TestSymbolicEmbedder) { // Test embedder with multiple basic blocks TEST_F(MIR2VecEmbeddingTestFixture, MultipleBasicBlocks) { // Create a test vocabulary - auto VocabOrErr = createTestVocab({{"NOOP", 1.0f}, {"TRAP", 2.0f}}); + auto VocabOrErr = + createTestVocab({{"NOOP", 1.0f}, {"TRAP", 2.0f}}, {}, {}, {}); ASSERT_TRUE(static_cast(VocabOrErr)) << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); auto &Vocab = *VocabOrErr; @@ -431,7 +497,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, EmptyBasicBlock) { MF->push_back(MBB); // Create embedder - auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 2); + auto VocabOrErr = + MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 2); ASSERT_TRUE(static_cast(VocabOrErr)) << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); auto &V = *VocabOrErr; @@ -452,7 +519,7 @@ TEST_F(MIR2VecEmbeddingTestFixture, EmptyBasicBlock) { TEST_F(MIR2VecEmbeddingTestFixture, UnknownOpcodes) { // Create a test vocabulary with limited entries // SUB is intentionally not included - auto VocabOrErr = createTestVocab({{"ADD", 1.0f}}); + auto VocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {}); ASSERT_TRUE(static_cast(VocabOrErr)) << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); auto &Vocab = *VocabOrErr; @@ -494,4 +561,210 @@ TEST_F(MIR2VecEmbeddingTestFixture, UnknownOpcodes) { Embedding ExpectedBBVector(2, 1.0f * ExpectedWeight); EXPECT_TRUE(MBBVector.approximatelyEquals(ExpectedBBVector)); } + +// Test vocabulary string key generation +TEST_F(MIR2VecEmbeddingTestFixture, VocabularyStringKeys) { + auto VocabOrErr = + createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, {}, {}, {}, 2); + ASSERT_TRUE(static_cast(VocabOrErr)) + << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); + auto &Vocab = *VocabOrErr; + + // Test that we can get string keys for all positions + for (size_t Pos = 0; Pos < Vocab.getCanonicalSize(); ++Pos) { + std::string Key = Vocab.getStringKey(Pos); + EXPECT_FALSE(Key.empty()) << "Empty key at position " << Pos; + } + + // Test specific known positions if we can identify them + unsigned AddIndex = Vocab.getCanonicalIndexForBaseName("ADD"); + std::string AddKey = Vocab.getStringKey(AddIndex); + EXPECT_EQ(AddKey, "ADD"); + + unsigned SubIndex = Vocab.getCanonicalIndexForBaseName("SUB"); + std::string SubKey = Vocab.getStringKey(SubIndex); + EXPECT_EQ(SubKey, "SUB"); + + unsigned ImmIndex = Vocab.getCanonicalIndexForOperandName("Immediate"); + std::string ImmKey = Vocab.getStringKey(ImmIndex); + EXPECT_EQ(ImmKey, "Immediate"); 
+ + unsigned PhyRegIndex = Vocab.getCanonicalIndexForRegisterClass("GR32", true); + std::string PhyRegKey = Vocab.getStringKey(PhyRegIndex); + EXPECT_EQ(PhyRegKey, "PhyReg_GR32"); + + unsigned VirtRegIndex = + Vocab.getCanonicalIndexForRegisterClass("GR32", false); + std::string VirtRegKey = Vocab.getStringKey(VirtRegIndex); + EXPECT_EQ(VirtRegKey, "VirtReg_GR32"); +} + +// Test vocabulary dimension consistency +TEST_F(MIR2VecEmbeddingTestFixture, DimensionConsistency) { + auto VocabOrErr = createTestVocab({{"TEST", 1.0f}}, {}, {}, {}, 5); + ASSERT_TRUE(static_cast(VocabOrErr)) + << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); + auto &Vocab = *VocabOrErr; + + EXPECT_EQ(Vocab.getDimension(), 5u); + + // All embeddings should have the same dimension + for (auto IT = Vocab.begin(); IT != Vocab.end(); ++IT) + EXPECT_EQ((*IT).size(), 5u); +} + +// Test invalid register handling through machine instruction creation +TEST_F(MIR2VecEmbeddingTestFixture, InvalidRegisterHandling) { + float MOVValue = 1.5f; + float ImmValue = 0.5f; + float PhyRegValue = 0.2f; + auto VocabOrErr = createTestVocab( + {{"MOV", MOVValue}}, {{"Immediate", ImmValue}}, + {{"GR8_ABCD_H", PhyRegValue}, {"GR8_ABCD_L", PhyRegValue + 0.1f}}, {}, 3); + ASSERT_TRUE(static_cast(VocabOrErr)) + << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); + auto &Vocab = *VocabOrErr; + + MachineBasicBlock *MBB = MF->CreateMachineBasicBlock(); + MF->push_back(MBB); + + // Create a MOV instruction with actual operands including potential $noreg + // This tests the actual scenario where invalid registers are encountered + auto MovOpcode = findOpcodeByName("MOV32mr"); + ASSERT_NE(MovOpcode, -1) << "MOV32mr opcode not found"; + const MCInstrDesc &Desc = TII->get(MovOpcode); + + // Use available physical registers from the target + unsigned BaseReg = + TRI->getNumRegs() > 1 ? 1 : 0; // First available physical register + unsigned ValueReg = TRI->getNumRegs() > 2 ? 
2 : BaseReg; + + // MOV32mr typically has: base, scale, index, displacement, segment, value + // Use the MachineInstrBuilder API properly + auto MovInst = BuildMI(*MBB, MBB->end(), DebugLoc(), Desc) + .addReg(BaseReg) // base + .addImm(1) // scale + .addReg(0) // index ($noreg) + .addImm(-4) // displacement + .addReg(0) // segment ($noreg) + .addReg(ValueReg); // value + + auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab); + ASSERT_TRUE(Embedder != nullptr); + + // This should not crash even if the instruction has $noreg operands + auto InstEmb = Embedder->getMInstVector(*MovInst); + EXPECT_EQ(InstEmb.size(), 3u); + + // Test the expected embedding value + Embedding ExpectedOpcodeContribution(3, MOVValue * mir2vec::OpcWeight); + auto ExpectedOperandContribution = + Embedding(3, PhyRegValue * mir2vec::RegOperandWeight) // Base + + Embedding(3, ImmValue * mir2vec::CommonOperandWeight) // Scale + + Embedding(3, 0.0f) // noreg + + Embedding(3, ImmValue * mir2vec::CommonOperandWeight) // displacement + + Embedding(3, 0.0f) // noreg + + Embedding(3, (PhyRegValue + 0.1f) * mir2vec::RegOperandWeight); // Value + auto ExpectedEmb = ExpectedOpcodeContribution + ExpectedOperandContribution; + EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb)) + << "MOV instruction embedding should match expected embedding"; +} + +// Test handling of both physical and virtual registers in an instruction +TEST_F(MIR2VecEmbeddingTestFixture, PhysicalAndVirtualRegisterHandling) { + float MOVValue = 2.0f; + float ImmValue = 0.7f; + float PhyRegValue = 0.3f; + float VirtRegValue = 0.9f; + + // Find GR32 register class + const TargetRegisterClass *GR32RC = nullptr; + for (unsigned i = 0; i < TRI->getNumRegClasses(); ++i) { + const TargetRegisterClass *RC = TRI->getRegClass(i); + if (std::string(TRI->getRegClassName(RC)) == "GR32") { + GR32RC = RC; + break; + } + } + ASSERT_TRUE(GR32RC != nullptr && GR32RC->isAllocatable()) + << "No allocatable GR32 register class found"; + + // Get first available physical register from GR32 + unsigned PhyReg = *GR32RC->begin(); + // Create a virtual register of class GR32 + unsigned VirtReg = MF->getRegInfo().createVirtualRegister(GR32RC); + + // Create vocabulary with register class based keys + auto VocabOrErr = + createTestVocab({{"MOV", MOVValue}}, {{"Immediate", ImmValue}}, + {{"GR32_AD", PhyRegValue}}, // GR32_AD is the minimal key + {{"GR32", VirtRegValue}}, 4); + ASSERT_TRUE(static_cast(VocabOrErr)) + << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); + auto &Vocab = *VocabOrErr; + + MachineBasicBlock *MBB = MF->CreateMachineBasicBlock(); + MF->push_back(MBB); + + // Create a MOV32rr instruction: MOV32rr dst, src + auto MovOpcode = findOpcodeByName("MOV32rr"); + ASSERT_NE(MovOpcode, -1) << "MOV32rr opcode not found"; + const MCInstrDesc &Desc = TII->get(MovOpcode); + + // MOV32rr: dst (physical), src (virtual) + auto MovInst = BuildMI(*MBB, MBB->end(), DebugLoc(), Desc) + .addReg(PhyReg) // physical register destination + .addReg(VirtReg); // virtual register source + + // Create embedder with virtual register support + auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab); + ASSERT_TRUE(Embedder != nullptr); + + // This should not crash and should produce a valid embedding + auto InstEmb = Embedder->getMInstVector(*MovInst); + EXPECT_EQ(InstEmb.size(), 4u); + + // Test the expected embedding value + Embedding ExpectedOpcodeContribution(4, MOVValue * mir2vec::OpcWeight); + auto ExpectedOperandContribution = + Embedding(4, PhyRegValue * 
mir2vec::RegOperandWeight) // dst (physical) + + Embedding(4, VirtRegValue * mir2vec::RegOperandWeight); // src (virtual) + auto ExpectedEmb = ExpectedOpcodeContribution + ExpectedOperandContribution; + EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb)) + << "MOV32rr instruction embedding should match expected embedding"; +} + +// Test precise embedding calculation with known operands +TEST_F(MIR2VecEmbeddingTestFixture, EmbeddingCalculation) { + auto VocabOrErr = createTestVocab({{"NOOP", 2.0f}}, {}, {}, {}, 2); + ASSERT_TRUE(static_cast(VocabOrErr)) + << "Failed to create vocabulary: " << toString(VocabOrErr.takeError()); + auto &Vocab = *VocabOrErr; + + MachineBasicBlock *MBB = MF->CreateMachineBasicBlock(); + MF->push_back(MBB); + + // Create a simple NOOP instruction (no operands) + auto NoopInst = createMachineInstr(*MBB, "NOOP"); + ASSERT_TRUE(NoopInst != nullptr); + + auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab); + ASSERT_TRUE(Embedder != nullptr); + + // Get the instruction embedding + auto InstEmb = Embedder->getMInstVector(*NoopInst); + EXPECT_EQ(InstEmb.size(), 2u); + + // For NOOP with no operands, the embedding should be exactly the opcode + // embedding + float ExpectedWeight = mir2vec::OpcWeight; + Embedding ExpectedEmb(2, 2.0f * ExpectedWeight); + + EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb)) + << "NOOP instruction embedding should match opcode embedding"; + + // Verify individual components + EXPECT_FLOAT_EQ(InstEmb[0], 2.0f * ExpectedWeight); + EXPECT_FLOAT_EQ(InstEmb[1], 2.0f * ExpectedWeight); +} } // namespace From 99abda7b02c9d6ba8b996867d2de624815ace1ce Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:00:26 -0700 Subject: [PATCH 146/530] Revert "[clang-format] Annotate ::operator and Foo::operator correctly" (#164670) Reverts llvm/llvm-project#164048 This led to a regression in clang-format where a space gets added in between the parameter type and `&`. For example, this ``` ::test_anonymous::FunctionApplication& ::test_anonymous::FunctionApplication::operator=(const ::test_anonymous::FunctionApplication& other) noexcept { ``` becomes ``` ::test_anonymous::FunctionApplication& ::test_anonymous::FunctionApplication::operator=(const ::test_anonymous::FunctionApplication & other) noexcept { ``` --- clang/lib/Format/TokenAnnotator.cpp | 10 +++++++--- clang/unittests/Format/TokenAnnotatorTest.cpp | 5 ----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index c97a9e81eb59e..25971d2497f97 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3791,12 +3791,18 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts, if (Current.is(TT_FunctionDeclarationName)) return true; - if (Current.isNoneOf(tok::identifier, tok::kw_operator)) + if (!Current.Tok.getIdentifierInfo()) return false; const auto *Prev = Current.getPreviousNonComment(); assert(Prev); + if (Prev->is(tok::coloncolon)) + Prev = Prev->Previous; + + if (!Prev) + return false; + const auto &Previous = *Prev; if (const auto *PrevPrev = Previous.getPreviousNonComment(); @@ -3845,8 +3851,6 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts, // Find parentheses of parameter list. 
if (Current.is(tok::kw_operator)) { - if (Line.startsWith(tok::kw_friend)) - return true; if (Previous.Tok.getIdentifierInfo() && Previous.isNoneOf(tok::kw_return, tok::kw_co_return)) { return true; diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index ca99940890984..f3637383a0a65 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1129,11 +1129,6 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) { ASSERT_EQ(Tokens.size(), 7u) << Tokens; // Not TT_FunctionDeclarationName. EXPECT_TOKEN(Tokens[3], tok::kw_operator, TT_Unknown); - - Tokens = annotate("SomeAPI::operator()();"); - ASSERT_EQ(Tokens.size(), 9u) << Tokens; - // Not TT_FunctionDeclarationName. - EXPECT_TOKEN(Tokens[2], tok::kw_operator, TT_Unknown); } TEST_F(TokenAnnotatorTest, OverloadedOperatorInTemplate) { From 04b5cc6a2457dbd6b320c8345959cf60c94e3cc6 Mon Sep 17 00:00:00 2001 From: Anutosh Bhat Date: Wed, 22 Oct 2025 23:37:47 +0530 Subject: [PATCH 147/530] [clang-repl] Fix duplicate definition error for symbols in C mode (#164597) Fixes #164596 --- clang/lib/Sema/Sema.cpp | 7 +++++++ clang/test/Interpreter/pretty-print.c | 22 ++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 215ac184a5337..8ed3df7cf17a5 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1484,6 +1484,13 @@ void Sema::ActOnEndOfTranslationUnit() { Consumer.CompleteTentativeDefinition(VD); } + // In incremental mode, tentative definitions belong to the current + // partial translation unit (PTU). Once they have been completed and + // emitted to codegen, drop them to prevent re-emission in future PTUs. + if (PP.isIncrementalProcessingEnabled()) + TentativeDefinitions.erase(TentativeDefinitions.begin(ExternalSource.get()), + TentativeDefinitions.end()); + for (auto *D : ExternalDeclarations) { if (!D || D->isInvalidDecl() || D->getPreviousDecl() || !D->isUsed()) continue; diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c index 588df70e33e84..d0712fb152107 100644 --- a/clang/test/Interpreter/pretty-print.c +++ b/clang/test/Interpreter/pretty-print.c @@ -75,9 +75,10 @@ int * ptr = (int*)0x123; ptr int * null_ptr = (int*)0; null_ptr // CHECK-NEXT: (int *) 0x0 +union U { int I; float F; } u; u.I = 12; u.I +// CHECK-NEXT: (int) 12 + // TODO: _Bool, _Complex, _Atomic, and _BitInt -// union U { int I; float F; } u; u.I = 12; u.I -// TODO-CHECK-NEXT: (int) 12 // struct S1{} s1; s1 // TODO-CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}} @@ -86,4 +87,21 @@ int * null_ptr = (int*)0; null_ptr // E.d // TODO-CHECK-NEXT: (int) 22 +// ----------------------------------------------------------------------------- +// Tentative definition handling (C99 6.9.2) +// Verify that multiple distinct tentative definitions across inputs no longer +// conflict. Each variable should emit correctly in its own incremental module. +// ----------------------------------------------------------------------------- + +int t1; +int t2; +int t3; +t1 = 1; t2 = 2; t3 = 3; +t1 + t2 + t3 +// CHECK-NEXT: (int) 6 + +// A redefinition of an existing tentative variable should still fail. 
+int t1; +// expected-error {{duplicate definition of symbol '_t1'}} + %quit From 9fc353ee36d646ed0dc609734a364437265ec374 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 22 Oct 2025 11:09:42 -0700 Subject: [PATCH 148/530] [libc] Fix a couple issues in header (#164666) * Add FILE type declaration, as it should be presented in ``, as well as in `` * Fix argument type in `wcsrtombs` / `wcsnrtombs` function - it should be restrict pointer to `mbstate_t`. Add restrict qualifier to internal implementation as well. This brings us closer to being able to build libcxx with wide-character support against llvm-libc headers. --- libc/include/wchar.yaml | 11 ++++++----- libc/src/wchar/wcsnrtombs.cpp | 2 +- libc/src/wchar/wcsnrtombs.h | 2 +- libc/src/wchar/wcsrtombs.cpp | 2 +- libc/src/wchar/wcsrtombs.h | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 8178091ab2202..b8a0a748cd3ad 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -4,6 +4,7 @@ macros: - macro_name: NULL macro_header: null-macro.h types: + - type_name: FILE - type_name: size_t - type_name: wint_t - type_name: wchar_t @@ -104,9 +105,9 @@ functions: - name: wmemset standards: - stdc - return_type: wchar_t* + return_type: wchar_t * arguments: - - type: wchar_t* + - type: wchar_t * - type: wchar_t - type: size_t - name: wcschr @@ -246,7 +247,7 @@ functions: - type: const wchar_t **__restrict - type: size_t - type: size_t - - type: mbstate_t + - type: mbstate_t *__restrict - name: wcsrtombs standards: - stdc @@ -255,7 +256,7 @@ functions: - type: char *__restrict - type: const wchar_t **__restrict - type: size_t - - type: mbstate_t + - type: mbstate_t *__restrict - name: wcrtomb standards: - stdc @@ -299,7 +300,7 @@ functions: arguments: - type: wchar_t *__restrict - type: const wchar_t *__restrict - - type: wchar_t** __restrict + - type: wchar_t **__restrict - name: wcpcpy standards: - stdc diff --git a/libc/src/wchar/wcsnrtombs.cpp b/libc/src/wchar/wcsnrtombs.cpp index 7f25b248a0863..a344c2331b532 100644 --- a/libc/src/wchar/wcsnrtombs.cpp +++ b/libc/src/wchar/wcsnrtombs.cpp @@ -22,7 +22,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(size_t, wcsnrtombs, (char *__restrict s, const wchar_t **__restrict pwcs, - size_t nwc, size_t len, mbstate_t *ps)) { + size_t nwc, size_t len, mbstate_t *__restrict ps)) { LIBC_CRASH_ON_NULLPTR(pwcs); static internal::mbstate internal_mbstate; auto result = internal::wcsnrtombs( diff --git a/libc/src/wchar/wcsnrtombs.h b/libc/src/wchar/wcsnrtombs.h index bf8add75b2951..2ca42c71e2e9d 100644 --- a/libc/src/wchar/wcsnrtombs.h +++ b/libc/src/wchar/wcsnrtombs.h @@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL { size_t wcsnrtombs(char *__restrict s, const wchar_t **__restrict pwcs, - size_t nwc, size_t len, mbstate_t *ps); + size_t nwc, size_t len, mbstate_t *__restrict ps); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/wcsrtombs.cpp b/libc/src/wchar/wcsrtombs.cpp index 9d2508cb81a8c..0167e857128de 100644 --- a/libc/src/wchar/wcsrtombs.cpp +++ b/libc/src/wchar/wcsrtombs.cpp @@ -22,7 +22,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(size_t, wcsrtombs, (char *__restrict s, const wchar_t **__restrict pwcs, - size_t n, mbstate_t *ps)) { + size_t n, mbstate_t *__restrict ps)) { LIBC_CRASH_ON_NULLPTR(pwcs); static internal::mbstate internal_mbstate; auto result = internal::wcsnrtombs( diff --git a/libc/src/wchar/wcsrtombs.h b/libc/src/wchar/wcsrtombs.h index 
d23573f5b9418..b85e2c6ed740a 100644 --- a/libc/src/wchar/wcsrtombs.h +++ b/libc/src/wchar/wcsrtombs.h @@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL { size_t wcsrtombs(char *__restrict s, const wchar_t **__restrict pwcs, size_t n, - mbstate_t *ps); + mbstate_t *__restrict ps); } // namespace LIBC_NAMESPACE_DECL From 58d4f180676776ed70f7408189f0fcd087919439 Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Wed, 22 Oct 2025 21:25:27 +0300 Subject: [PATCH 149/530] [libc][annex_k] Add Annex K support macros. (#163100) RFC https://discourse.llvm.org/t/rfc-bounds-checking-interfaces-for-llvm-libc/87685 Add internal support macros required by Annex K interface in LLVM libc. --- libc/include/llvm-libc-macros/CMakeLists.txt | 6 +++++ .../include/llvm-libc-macros/annex-k-macros.h | 27 +++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 libc/include/llvm-libc-macros/annex-k-macros.h diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index a33ff6cabbcf5..b16337cccd58b 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -31,6 +31,12 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) endif() +add_macro_header( + annex_k_macros + HDR + annex-k-macros.h +) + add_macro_header( assert_macros HDR diff --git a/libc/include/llvm-libc-macros/annex-k-macros.h b/libc/include/llvm-libc-macros/annex-k-macros.h new file mode 100644 index 0000000000000..7cfb5c1d3060b --- /dev/null +++ b/libc/include/llvm-libc-macros/annex-k-macros.h @@ -0,0 +1,27 @@ +//===-- Definition of macros to be used with Annex K functions ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_INCLUDE_LLVM_LIBC_MACROS_ANNEX_K_MACROS_H +#define LLVM_LIBC_INCLUDE_LLVM_LIBC_MACROS_ANNEX_K_MACROS_H + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \ + (defined(__cplusplus) && __cplusplus >= 201703L) + +// TODO(bassiounix): Who should def this macro (clang vs libc)? Where? +// TODO(bassiounix): uncomment/move when Annex K is fully implemented. +// #define __STDC_LIB_EXT1__ 201112L + +#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ == 1 + +#define LIBC_HAS_ANNEX_K + +#endif // defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ == 1 + +#endif // (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || + // (defined(__cplusplus) && __cplusplus >= 201703L) +#endif // LLVM_LIBC_INCLUDE_LLVM_LIBC_MACROS_ANNEX_K_MACROS_H From 9a2e825c13655d955054bb65be5fc010c7ed69e3 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Oct 2025 14:30:19 -0400 Subject: [PATCH 150/530] [clang][CIR][mlir] Migrate to free create functions. NFC. (#164656) See https://discourse.llvm.org/t/psa-opty-create-now-with-100-more-tab-complete/87339. I plan to make these deprecated in https://github.com/llvm/llvm-project/pull/164649. 
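
The rewrite is mechanical: each `builder.create<OpTy>(loc, args...)` call
becomes the equivalent `OpTy::create(builder, loc, args...)` form, with no
change in behavior. A minimal before/after sketch of the pattern (the op and
operand names here are illustrative placeholders, not a specific hunk from
this patch):

```
// Before: create is a member template on the builder.
builder.create<cir::AssumeOp>(loc, argValue);

// After: a static create function on the op that takes the builder as its
// first argument; semantics are identical (NFC).
cir::AssumeOp::create(builder, loc, argValue);
```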
--- clang/lib/CIR/CodeGen/CIRGenAsm.cpp | 6 +- clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 20 +- clang/lib/CIR/CodeGen/CIRGenCXX.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenClass.cpp | 7 +- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 30 +- clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 13 +- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 118 ++--- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 30 +- clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 8 +- clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp | 12 +- clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp | 26 +- clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 99 ++-- clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 24 +- .../lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp | 4 +- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 8 +- clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp | 4 +- .../lib/CIR/Dialect/Transforms/FlattenCFG.cpp | 53 +- .../Dialect/Transforms/LoweringPrepare.cpp | 18 +- .../LoweringPrepareItaniumCXXABI.cpp | 9 +- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 474 +++++++++--------- clang/lib/CIR/Lowering/LoweringHelpers.cpp | 12 +- clang/unittests/CIR/PointerLikeTest.cpp | 34 +- 23 files changed, 516 insertions(+), 499 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenAsm.cpp b/clang/lib/CIR/CodeGen/CIRGenAsm.cpp index 17dffb3515d2a..88a7e85cb2a64 100644 --- a/clang/lib/CIR/CodeGen/CIRGenAsm.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenAsm.cpp @@ -117,9 +117,9 @@ mlir::LogicalResult CIRGenFunction::emitAsmStmt(const AsmStmt &s) { bool hasSideEffect = s.isVolatile() || s.getNumOutputs() == 0; - cir::InlineAsmOp ia = builder.create( - getLoc(s.getAsmLoc()), resultType, operands, asmString, constraints, - hasSideEffect, inferFlavor(cgm, s), mlir::ArrayAttr()); + cir::InlineAsmOp ia = cir::InlineAsmOp::create( + builder, getLoc(s.getAsmLoc()), resultType, operands, asmString, + constraints, hasSideEffect, inferFlavor(cgm, s), mlir::ArrayAttr()); if (isGCCAsmGoto) { assert(!cir::MissingFeatures::asmGoto()); diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 798e9d9fbb99e..27c4d11fa233a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -46,9 +46,9 @@ static RValue emitBuiltinBitOp(CIRGenFunction &cgf, const CallExpr *e, Op op; if constexpr (std::is_same_v || std::is_same_v) - op = builder.create(cgf.getLoc(e->getSourceRange()), arg, poisonZero); + op = Op::create(builder, cgf.getLoc(e->getSourceRange()), arg, poisonZero); else - op = builder.create(cgf.getLoc(e->getSourceRange()), arg); + op = Op::create(builder, cgf.getLoc(e->getSourceRange()), arg); mlir::Value result = op.getResult(); mlir::Type exprTy = cgf.convertType(e->getType()); @@ -67,8 +67,8 @@ RValue CIRGenFunction::emitRotate(const CallExpr *e, bool isRotateLeft) { // to the type of input when necessary. 
assert(!cir::MissingFeatures::msvcBuiltins()); - auto r = builder.create(getLoc(e->getSourceRange()), input, - amount, isRotateLeft); + auto r = cir::RotateOp::create(builder, getLoc(e->getSourceRange()), input, + amount, isRotateLeft); return RValue::get(r); } @@ -227,14 +227,14 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return RValue::get(nullptr); mlir::Value argValue = emitCheckedArgForAssume(e->getArg(0)); - builder.create(loc, argValue); + cir::AssumeOp::create(builder, loc, argValue); return RValue::get(nullptr); } case Builtin::BI__builtin_assume_separate_storage: { mlir::Value value0 = emitScalarExpr(e->getArg(0)); mlir::Value value1 = emitScalarExpr(e->getArg(1)); - builder.create(loc, value0, value1); + cir::AssumeSepStorageOp::create(builder, loc, value0, value1); return RValue::get(nullptr); } @@ -363,8 +363,8 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, probability); } - auto result = builder.create( - loc, argValue.getType(), argValue, expectedValue, probAttr); + auto result = cir::ExpectOp::create(builder, loc, argValue.getType(), + argValue, expectedValue, probAttr); return RValue::get(result); } @@ -375,7 +375,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI_byteswap_ulong: case Builtin::BI_byteswap_uint64: { mlir::Value arg = emitScalarExpr(e->getArg(0)); - return RValue::get(builder.create(loc, arg)); + return RValue::get(cir::ByteSwapOp::create(builder, loc, arg)); } case Builtin::BI__builtin_bitreverse8: @@ -383,7 +383,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI__builtin_bitreverse32: case Builtin::BI__builtin_bitreverse64: { mlir::Value arg = emitScalarExpr(e->getArg(0)); - return RValue::get(builder.create(loc, arg)); + return RValue::get(cir::BitReverseOp::create(builder, loc, arg)); } case Builtin::BI__builtin_rotateleft8: diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp index 171ce1c950907..ca421e5faf38f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp @@ -151,7 +151,7 @@ static void emitDeclDestroy(CIRGenFunction &cgf, const VarDecl *vd, // Don't confuse lexical cleanup. builder.clearInsertionPoint(); } else { - builder.create(addr.getLoc()); + cir::YieldOp::create(builder, addr.getLoc()); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 89f492603b11c..5046e0945002f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -725,8 +725,9 @@ void CIRGenFunction::emitCXXAggrConstructorCall( // Emit the constructor call that will execute for every array element. 
mlir::Value arrayOp = builder.createPtrBitcast(arrayBase.getPointer(), arrayTy); - builder.create( - *currSrcLoc, arrayOp, [&](mlir::OpBuilder &b, mlir::Location loc) { + cir::ArrayCtor::create( + builder, *currSrcLoc, arrayOp, + [&](mlir::OpBuilder &b, mlir::Location loc) { mlir::BlockArgument arg = b.getInsertionBlock()->addArgument(ptrToElmType, loc); Address curAddr = Address(arg, elementType, eltAlignment); @@ -738,7 +739,7 @@ void CIRGenFunction::emitCXXAggrConstructorCall( emitCXXConstructorCall(ctor, Ctor_Complete, /*ForVirtualBase=*/false, /*Delegating=*/false, currAVS, e); - builder.create(loc); + cir::YieldOp::create(builder, loc); }); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 52021fce1c675..4f7da16e15b63 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -251,8 +251,8 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst, const mlir::Location loc = dst.getVectorPointer().getLoc(); const mlir::Value vector = builder.createLoad(loc, dst.getVectorAddress()); - const mlir::Value newVector = builder.create( - loc, vector, src.getValue(), dst.getVectorIdx()); + const mlir::Value newVector = cir::VecInsertOp::create( + builder, loc, vector, src.getValue(), dst.getVectorIdx()); builder.createStore(loc, newVector, dst.getVectorAddress()); return; } @@ -615,8 +615,8 @@ RValue CIRGenFunction::emitLoadOfLValue(LValue lv, SourceLocation loc) { if (lv.isVectorElt()) { const mlir::Value load = builder.createLoad(getLoc(loc), lv.getVectorAddress()); - return RValue::get(builder.create(getLoc(loc), load, - lv.getVectorIdx())); + return RValue::get(cir::VecExtractOp::create(builder, getLoc(loc), load, + lv.getVectorIdx())); } cgm.errorNYI(loc, "emitLoadOfLValue"); @@ -1685,8 +1685,8 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { mlir::OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(cgm.getModule().getBody()); - clone = builder.create(calleeFunc.getLoc(), fdInlineName, - calleeFunc.getFunctionType()); + clone = cir::FuncOp::create(builder, calleeFunc.getLoc(), fdInlineName, + calleeFunc.getFunctionType()); clone.setLinkageAttr(cir::GlobalLinkageKindAttr::get( &cgm.getMLIRContext(), cir::GlobalLinkageKind::InternalLinkage)); clone.setSymVisibility("private"); @@ -1778,8 +1778,8 @@ RValue CIRGenFunction::emitCall(clang::QualType calleeTy, mlir::Operation *fn = callee.getFunctionPointer(); mlir::Value addr; if (auto funcOp = mlir::dyn_cast(fn)) { - addr = builder.create( - getLoc(e->getSourceRange()), + addr = cir::GetGlobalOp::create( + builder, getLoc(e->getSourceRange()), cir::PointerType::get(funcOp.getFunctionType()), funcOp.getSymName()); } else { addr = fn->getResult(0); @@ -1996,9 +1996,9 @@ cir::IfOp CIRGenFunction::emitIfOnBoolExpr( // Emit the code with the fully general case. mlir::Value condV = emitOpOnBoolExpr(loc, cond); - return builder.create(loc, condV, elseLoc.has_value(), - /*thenBuilder=*/thenBuilder, - /*elseBuilder=*/elseBuilder); + return cir::IfOp::create(builder, loc, condV, elseLoc.has_value(), + /*thenBuilder=*/thenBuilder, + /*elseBuilder=*/elseBuilder); } /// TODO(cir): see EmitBranchOnBoolExpr for extra ideas). 
@@ -2025,12 +2025,12 @@ mlir::Value CIRGenFunction::emitOpOnBoolExpr(mlir::Location loc, loc, condV, /*thenBuilder=*/ [this, trueExpr](mlir::OpBuilder &b, mlir::Location loc) { mlir::Value lhs = emitScalarExpr(trueExpr); - b.create(loc, lhs); + cir::YieldOp::create(b, loc, lhs); }, /*elseBuilder=*/ [this, falseExpr](mlir::OpBuilder &b, mlir::Location loc) { mlir::Value rhs = emitScalarExpr(falseExpr); - b.create(loc, rhs); + cir::YieldOp::create(b, loc, rhs); }) .getResult(); @@ -2211,8 +2211,8 @@ Address CIRGenFunction::emitLoadOfReference(LValue refLVal, mlir::Location loc, cgm.errorNYI(loc, "load of volatile reference"); cir::LoadOp load = - builder.create(loc, refLVal.getAddress().getElementType(), - refLVal.getAddress().getPointer()); + cir::LoadOp::create(builder, loc, refLVal.getAddress().getElementType(), + refLVal.getAddress().getPointer()); assert(!cir::MissingFeatures::opTBAA()); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index d8f4943a7755a..33ffa0d174746 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -390,7 +390,7 @@ ComplexExprEmitter::VisitImaginaryLiteral(const ImaginaryLiteral *il) { } auto complexAttr = cir::ConstComplexAttr::get(realValueAttr, imagValueAttr); - return builder.create(loc, complexAttr); + return cir::ConstantOp::create(builder, loc, complexAttr); } mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) { @@ -601,7 +601,7 @@ mlir::Value ComplexExprEmitter::emitBinAdd(const BinOpInfo &op) { if (mlir::isa(op.lhs.getType()) && mlir::isa(op.rhs.getType())) - return builder.create(op.loc, op.lhs, op.rhs); + return cir::ComplexAddOp::create(builder, op.loc, op.lhs, op.rhs); if (mlir::isa(op.lhs.getType())) { mlir::Value real = builder.createComplexReal(op.loc, op.lhs); @@ -623,7 +623,7 @@ mlir::Value ComplexExprEmitter::emitBinSub(const BinOpInfo &op) { if (mlir::isa(op.lhs.getType()) && mlir::isa(op.rhs.getType())) - return builder.create(op.loc, op.lhs, op.rhs); + return cir::ComplexSubOp::create(builder, op.loc, op.lhs, op.rhs); if (mlir::isa(op.lhs.getType())) { mlir::Value real = builder.createComplexReal(op.loc, op.lhs); @@ -664,7 +664,8 @@ mlir::Value ComplexExprEmitter::emitBinMul(const BinOpInfo &op) { mlir::isa(op.rhs.getType())) { cir::ComplexRangeKind rangeKind = getComplexRangeAttr(op.fpFeatures.getComplexRange()); - return builder.create(op.loc, op.lhs, op.rhs, rangeKind); + return cir::ComplexMulOp::create(builder, op.loc, op.lhs, op.rhs, + rangeKind); } if (mlir::isa(op.lhs.getType())) { @@ -975,14 +976,14 @@ mlir::Value ComplexExprEmitter::VisitAbstractConditionalOperator( [&](mlir::OpBuilder &b, mlir::Location loc) { eval.beginEvaluation(); mlir::Value trueValue = Visit(e->getTrueExpr()); - b.create(loc, trueValue); + cir::YieldOp::create(b, loc, trueValue); eval.endEvaluation(); }, /*elseBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { eval.beginEvaluation(); mlir::Value falseValue = Visit(e->getFalseExpr()); - b.create(loc, falseValue); + cir::YieldOp::create(b, loc, falseValue); eval.endEvaluation(); }) .getResult(); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 33eb748d72219..abfe07631da34 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -164,22 +164,22 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitIntegerLiteral(const IntegerLiteral *e) { mlir::Type type = 
cgf.convertType(e->getType()); - return builder.create( - cgf.getLoc(e->getExprLoc()), cir::IntAttr::get(type, e->getValue())); + return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), + cir::IntAttr::get(type, e->getValue())); } mlir::Value VisitFloatingLiteral(const FloatingLiteral *e) { mlir::Type type = cgf.convertType(e->getType()); assert(mlir::isa(type) && "expect floating-point type"); - return builder.create( - cgf.getLoc(e->getExprLoc()), cir::FPAttr::get(type, e->getValue())); + return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), + cir::FPAttr::get(type, e->getValue())); } mlir::Value VisitCharacterLiteral(const CharacterLiteral *e) { mlir::Type ty = cgf.convertType(e->getType()); auto init = cir::IntAttr::get(ty, e->getValue()); - return builder.create(cgf.getLoc(e->getExprLoc()), init); + return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), init); } mlir::Value VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *e) { @@ -227,7 +227,7 @@ class ScalarExprEmitter : public StmtVisitor { const mlir::Location loc = cgf.getLoc(e->getSourceRange()); const mlir::Value vecValue = Visit(e->getBase()); const mlir::Value indexValue = Visit(e->getIdx()); - return cgf.builder.create(loc, vecValue, indexValue); + return cir::VecExtractOp::create(cgf.builder, loc, vecValue, indexValue); } // Just load the lvalue formed by the subscript expression. return emitLoadOfLValue(e); @@ -238,8 +238,8 @@ class ScalarExprEmitter : public StmtVisitor { // The undocumented form of __builtin_shufflevector. mlir::Value inputVec = Visit(e->getExpr(0)); mlir::Value indexVec = Visit(e->getExpr(1)); - return cgf.builder.create( - cgf.getLoc(e->getSourceRange()), inputVec, indexVec); + return cir::VecShuffleDynamicOp::create( + cgf.builder, cgf.getLoc(e->getSourceRange()), inputVec, indexVec); } mlir::Value vec1 = Visit(e->getExpr(0)); @@ -257,9 +257,10 @@ class ScalarExprEmitter : public StmtVisitor { .getSExtValue())); } - return cgf.builder.create( - cgf.getLoc(e->getSourceRange()), cgf.convertType(e->getType()), vec1, - vec2, cgf.builder.getArrayAttr(indices)); + return cir::VecShuffleOp::create(cgf.builder, + cgf.getLoc(e->getSourceRange()), + cgf.convertType(e->getType()), vec1, vec2, + cgf.builder.getArrayAttr(indices)); } mlir::Value VisitConvertVectorExpr(ConvertVectorExpr *e) { @@ -296,8 +297,8 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitFloatToBoolConversion(mlir::Value src, mlir::Location loc) { cir::BoolType boolTy = builder.getBoolTy(); - return builder.create(loc, boolTy, - cir::CastKind::float_to_bool, src); + return cir::CastOp::create(builder, loc, boolTy, + cir::CastKind::float_to_bool, src); } mlir::Value emitIntToBoolConversion(mlir::Value srcVal, mlir::Location loc) { @@ -307,8 +308,8 @@ class ScalarExprEmitter : public StmtVisitor { // TODO: optimize this common case here or leave it for later // CIR passes? 
cir::BoolType boolTy = builder.getBoolTy(); - return builder.create(loc, boolTy, cir::CastKind::int_to_bool, - srcVal); + return cir::CastOp::create(builder, loc, boolTy, cir::CastKind::int_to_bool, + srcVal); } /// Convert the specified expression value to a boolean (!cir.bool) truth @@ -411,7 +412,8 @@ class ScalarExprEmitter : public StmtVisitor { } assert(castKind.has_value() && "Internal error: CastKind not set."); - return builder.create(src.getLoc(), fullDstTy, *castKind, src); + return cir::CastOp::create(builder, src.getLoc(), fullDstTy, *castKind, + src); } mlir::Value @@ -658,9 +660,9 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitUnaryOp(const UnaryOperator *e, cir::UnaryOpKind kind, mlir::Value input, bool nsw = false) { - return builder.create( - cgf.getLoc(e->getSourceRange().getBegin()), input.getType(), kind, - input, nsw); + return cir::UnaryOp::create(builder, + cgf.getLoc(e->getSourceRange().getBegin()), + input.getType(), kind, input, nsw); } mlir::Value VisitUnaryNot(const UnaryOperator *e) { @@ -967,9 +969,9 @@ class ScalarExprEmitter : public StmtVisitor { } else { // Other kinds of vectors. Element-wise comparison returning // a vector. - result = builder.create( - cgf.getLoc(boInfo.loc), cgf.convertType(boInfo.fullType), kind, - boInfo.lhs, boInfo.rhs); + result = cir::VecCmpOp::create(builder, cgf.getLoc(boInfo.loc), + cgf.convertType(boInfo.fullType), kind, + boInfo.lhs, boInfo.rhs); } } else if (boInfo.isFixedPointOp()) { assert(!cir::MissingFeatures::fixedPointType()); @@ -991,7 +993,7 @@ class ScalarExprEmitter : public StmtVisitor { assert(e->getOpcode() == BO_EQ || e->getOpcode() == BO_NE); BinOpInfo boInfo = emitBinOps(e); - result = builder.create(loc, kind, boInfo.lhs, boInfo.rhs); + result = cir::CmpOp::create(builder, loc, kind, boInfo.lhs, boInfo.rhs); } return emitScalarConversion(result, cgf.getContext().BoolTy, e->getType(), @@ -1093,8 +1095,8 @@ class ScalarExprEmitter : public StmtVisitor { CIRGenFunction::ConditionalEvaluation eval(cgf); mlir::Value lhsCondV = cgf.evaluateExprAsBool(e->getLHS()); - auto resOp = builder.create( - loc, lhsCondV, /*trueBuilder=*/ + auto resOp = cir::TernaryOp::create( + builder, loc, lhsCondV, /*trueBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { CIRGenFunction::LexicalScope lexScope{cgf, loc, b.getInsertionBlock()}; @@ -1139,8 +1141,8 @@ class ScalarExprEmitter : public StmtVisitor { CIRGenFunction::ConditionalEvaluation eval(cgf); mlir::Value lhsCondV = cgf.evaluateExprAsBool(e->getLHS()); - auto resOp = builder.create( - loc, lhsCondV, /*trueBuilder=*/ + auto resOp = cir::TernaryOp::create( + builder, loc, lhsCondV, /*trueBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { CIRGenFunction::LexicalScope lexScope{cgf, loc, b.getInsertionBlock()}; @@ -1609,19 +1611,19 @@ mlir::Value ScalarExprEmitter::emitMul(const BinOpInfo &ops) { return nullptr; } - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Mul, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Mul, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitDiv(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Div, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Div, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitRem(const BinOpInfo &ops) { - return 
builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Rem, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Rem, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitAdd(const BinOpInfo &ops) { @@ -1668,8 +1670,8 @@ mlir::Value ScalarExprEmitter::emitAdd(const BinOpInfo &ops) { return {}; } - return builder.create(loc, cgf.convertType(ops.fullType), - cir::BinOpKind::Add, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, loc, cgf.convertType(ops.fullType), + cir::BinOpKind::Add, ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) { @@ -1716,9 +1718,9 @@ mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) { return {}; } - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Sub, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), + cir::BinOpKind::Sub, ops.lhs, ops.rhs); } // If the RHS is not a pointer, then we have normal pointer @@ -1796,19 +1798,19 @@ mlir::Value ScalarExprEmitter::emitShr(const BinOpInfo &ops) { } mlir::Value ScalarExprEmitter::emitAnd(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::And, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::And, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitXor(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Xor, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Xor, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitOr(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Or, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Or, + ops.lhs, ops.rhs); } // Emit code for an explicit or implicit cast. Implicit @@ -2011,9 +2013,9 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { case CK_VectorSplat: { // Create a vector object and fill all elements with the same scalar value. assert(destTy->isVectorType() && "CK_VectorSplat to non-vector type"); - return builder.create( - cgf.getLoc(subExpr->getSourceRange()), cgf.convertType(destTy), - Visit(subExpr)); + return cir::VecSplatOp::create(builder, + cgf.getLoc(subExpr->getSourceRange()), + cgf.convertType(destTy), Visit(subExpr)); } case CK_FunctionToPointerDecay: return cgf.emitLValue(subExpr).getPointer(); @@ -2310,8 +2312,8 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( mlir::Value condValue = Visit(condExpr); mlir::Value lhsValue = Visit(lhsExpr); mlir::Value rhsValue = Visit(rhsExpr); - return builder.create(loc, condValue, lhsValue, - rhsValue); + return cir::VecTernaryOp::create(builder, loc, condValue, lhsValue, + rhsValue); } // If this is a really simple expression (like x ? 4 : 5), emit this as a @@ -2354,7 +2356,7 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( if (branch) { yieldTy = branch.getType(); - b.create(loc, branch); + cir::YieldOp::create(b, loc, branch); } else { // If LHS or RHS is a throw or void expression we need to patch // arms as to properly match yield types. 
@@ -2387,10 +2389,10 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( // Block does not return: build empty yield. if (mlir::isa(yieldTy)) { - builder.create(loc); + cir::YieldOp::create(builder, loc); } else { // Block returns: set null yield value. mlir::Value op0 = builder.getNullValue(yieldTy, loc); - builder.create(loc, op0); + cir::YieldOp::create(builder, loc, op0); } } } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index d3c0d9f109317..58feb36f78f23 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -264,11 +264,11 @@ void CIRGenFunction::LexicalScope::cleanup() { // If we now have one after `applyCleanup`, hook it up properly. if (!cleanupBlock && localScope->getCleanupBlock(builder)) { cleanupBlock = localScope->getCleanupBlock(builder); - builder.create(insPt->back().getLoc(), cleanupBlock); + cir::BrOp::create(builder, insPt->back().getLoc(), cleanupBlock); if (!cleanupBlock->mightHaveTerminator()) { mlir::OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToEnd(cleanupBlock); - builder.create(localScope->endLoc); + cir::YieldOp::create(builder, localScope->endLoc); } } @@ -286,7 +286,7 @@ void CIRGenFunction::LexicalScope::cleanup() { } } - builder.create(*returnLoc, returnBlock); + cir::BrOp::create(builder, *returnLoc, returnBlock); return; } } @@ -298,8 +298,8 @@ void CIRGenFunction::LexicalScope::cleanup() { // Ternary ops have to deal with matching arms for yielding types // and do return a value, it must do its own cir.yield insertion. if (!localScope->isTernary() && !insPt->mightHaveTerminator()) { - !retVal ? builder.create(localScope->endLoc) - : builder.create(localScope->endLoc, retVal); + !retVal ? cir::YieldOp::create(builder, localScope->endLoc) + : cir::YieldOp::create(builder, localScope->endLoc, retVal); } }; @@ -331,7 +331,7 @@ void CIRGenFunction::LexicalScope::cleanup() { // If there's a cleanup block, branch to it, nothing else to do. if (cleanupBlock) { - builder.create(curBlock->back().getLoc(), cleanupBlock); + cir::BrOp::create(builder, curBlock->back().getLoc(), cleanupBlock); return; } @@ -349,12 +349,12 @@ cir::ReturnOp CIRGenFunction::LexicalScope::emitReturn(mlir::Location loc) { assert(fn && "emitReturn from non-function"); if (!fn.getFunctionType().hasVoidReturn()) { // Load the value from `__retval` and return it via the `cir.return` op. - auto value = builder.create( - loc, fn.getFunctionType().getReturnType(), *cgf.fnRetAlloca); - return builder.create(loc, - llvm::ArrayRef(value.getResult())); + auto value = cir::LoadOp::create( + builder, loc, fn.getFunctionType().getReturnType(), *cgf.fnRetAlloca); + return cir::ReturnOp::create(builder, loc, + llvm::ArrayRef(value.getResult())); } - return builder.create(loc); + return cir::ReturnOp::create(builder, loc); } // This is copied from CodeGenModule::MayDropFunctionReturn. 
This is a @@ -389,9 +389,9 @@ void CIRGenFunction::LexicalScope::emitImplicitReturn() { if (shouldEmitUnreachable) { assert(!cir::MissingFeatures::sanitizers()); if (cgf.cgm.getCodeGenOpts().OptimizationLevel == 0) - builder.create(localScope->endLoc); + cir::TrapOp::create(builder, localScope->endLoc); else - builder.create(localScope->endLoc); + cir::UnreachableOp::create(builder, localScope->endLoc); builder.clearInsertionPoint(); return; } @@ -561,8 +561,8 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, if (!clone) { mlir::OpBuilder::InsertionGuard guard(builder); builder.setInsertionPoint(fn); - clone = builder.create(fn.getLoc(), fdInlineName, - fn.getFunctionType()); + clone = cir::FuncOp::create(builder, fn.getLoc(), fdInlineName, + fn.getFunctionType()); clone.setLinkage(cir::GlobalLinkageKind::InternalLinkage); clone.setSymVisibility("private"); clone.setInlineKind(cir::InlineKind::AlwaysInline); diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index e620310437219..1a5be0ecd62ad 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -1809,8 +1809,8 @@ CIRGenItaniumCXXABI::getVTableAddressPoint(BaseSubobject base, mlir::OpBuilder &builder = cgm.getBuilder(); auto vtablePtrTy = cir::VPtrType::get(builder.getContext()); - return builder.create( - cgm.getLoc(vtableClass->getSourceRange()), vtablePtrTy, + return cir::VTableAddrPointOp::create( + builder, cgm.getLoc(vtableClass->getSourceRange()), vtablePtrTy, mlir::FlatSymbolRefAttr::get(vtable.getSymNameAttr()), cir::AddressPointAttr::get(cgm.getBuilder().getContext(), addressPoint.VTableIndex, diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 6b293738fdf8a..46adfe28e377a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -535,7 +535,7 @@ cir::GlobalOp CIRGenModule::createGlobalOp(CIRGenModule &cgm, builder.setInsertionPointToStart(cgm.getModule().getBody()); } - g = builder.create(loc, name, t, isConstant); + g = cir::GlobalOp::create(builder, loc, name, t, isConstant); if (!insertPoint) cgm.lastGlobalOp = g; @@ -739,8 +739,8 @@ mlir::Value CIRGenModule::getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty, cir::GlobalOp g = getOrCreateCIRGlobal(d, ty, isForDefinition); mlir::Type ptrTy = builder.getPointerTo(g.getSymType()); - return builder.create(getLoc(d->getSourceRange()), ptrTy, - g.getSymName()); + return cir::GetGlobalOp::create(builder, getLoc(d->getSourceRange()), ptrTy, + g.getSymName()); } cir::GlobalViewAttr CIRGenModule::getAddrOfGlobalVarAttr(const VarDecl *d) { @@ -2176,7 +2176,7 @@ CIRGenModule::createCIRFunction(mlir::Location loc, StringRef name, if (cgf) builder.setInsertionPoint(cgf->curFn); - func = builder.create(loc, name, funcType); + func = cir::FuncOp::create(builder, loc, name, funcType); assert(!cir::MissingFeatures::opFuncAstDeclAttr()); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp index 5ba6bcb192b91..e7bf3bcc85c0b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp @@ -27,8 +27,8 @@ mlir::Value createBound(CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder, // Stride is always 1 in C/C++. 
mlir::Value stride = cgf.createOpenACCConstantInt(boundLoc, 64, 1); - auto bound = - builder.create(boundLoc, lowerBound, upperBound); + auto bound = mlir::acc::DataBoundsOp::create(builder, boundLoc, lowerBound, + upperBound); bound.getStartIdxMutable().assign(startIdx); if (extent) bound.getExtentMutable().assign(extent); @@ -48,8 +48,8 @@ mlir::Value CIRGenFunction::emitOpenACCIntExpr(const Expr *intExpr) { ? mlir::IntegerType::SignednessSemantics::Signed : mlir::IntegerType::SignednessSemantics::Unsigned); - auto conversionOp = builder.create( - exprLoc, targetType, expr); + auto conversionOp = mlir::UnrealizedConversionCastOp::create( + builder, exprLoc, targetType, expr); return conversionOp.getResult(0); } @@ -59,8 +59,8 @@ mlir::Value CIRGenFunction::createOpenACCConstantInt(mlir::Location loc, mlir::IntegerType ty = mlir::IntegerType::get(&getMLIRContext(), width, mlir::IntegerType::SignednessSemantics::Signless); - auto constOp = builder.create( - loc, builder.getIntegerAttr(ty, value)); + auto constOp = mlir::arith::ConstantOp::create( + builder, loc, builder.getIntegerAttr(ty, value)); return constOp; } diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 385f89c5544d6..50101373f3e9c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -96,8 +96,8 @@ class OpenACCClauseCIREmitter final mlir::IntegerType targetType = mlir::IntegerType::get( &cgf.getMLIRContext(), /*width=*/1, mlir::IntegerType::SignednessSemantics::Signless); - auto conversionOp = builder.create( - exprLoc, targetType, condition); + auto conversionOp = mlir::UnrealizedConversionCastOp::create( + builder, exprLoc, targetType, condition); return conversionOp.getResult(0); } @@ -107,8 +107,8 @@ class OpenACCClauseCIREmitter final mlir::IntegerType ty = mlir::IntegerType::get( &cgf.getMLIRContext(), width, mlir::IntegerType::SignednessSemantics::Signless); - auto constOp = builder.create( - loc, builder.getIntegerAttr(ty, value)); + auto constOp = mlir::arith::ConstantOp::create( + builder, loc, builder.getIntegerAttr(ty, value)); return constOp; } @@ -217,8 +217,8 @@ class OpenACCClauseCIREmitter final cgf.getOpenACCDataOperandInfo(varOperand); auto beforeOp = - builder.create(opInfo.beginLoc, opInfo.varValue, structured, - implicit, opInfo.name, opInfo.bounds); + BeforeOpTy::create(builder, opInfo.beginLoc, opInfo.varValue, + structured, implicit, opInfo.name, opInfo.bounds); operation.getDataClauseOperandsMutable().append(beforeOp.getResult()); AfterOpTy afterOp; @@ -231,12 +231,12 @@ class OpenACCClauseCIREmitter final // Detach/Delete ops don't have the variable reference here, so they // take 1 fewer argument to their build function. 
afterOp = - builder.create(opInfo.beginLoc, beforeOp, structured, - implicit, opInfo.name, opInfo.bounds); + AfterOpTy::create(builder, opInfo.beginLoc, beforeOp, structured, + implicit, opInfo.name, opInfo.bounds); } else { - afterOp = builder.create( - opInfo.beginLoc, beforeOp, opInfo.varValue, structured, implicit, - opInfo.name, opInfo.bounds); + afterOp = AfterOpTy::create(builder, opInfo.beginLoc, beforeOp, + opInfo.varValue, structured, implicit, + opInfo.name, opInfo.bounds); } } @@ -258,8 +258,8 @@ class OpenACCClauseCIREmitter final CIRGenFunction::OpenACCDataOperandInfo opInfo = cgf.getOpenACCDataOperandInfo(varOperand); auto beforeOp = - builder.create(opInfo.beginLoc, opInfo.varValue, structured, - implicit, opInfo.name, opInfo.bounds); + BeforeOpTy::create(builder, opInfo.beginLoc, opInfo.varValue, + structured, implicit, opInfo.name, opInfo.bounds); operation.getDataClauseOperandsMutable().append(beforeOp.getResult()); // Set the 'rest' of the info for the operation. diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index f486c46be2035..1eb7199ce6dfe 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -91,8 +91,9 @@ mlir::LogicalResult CIRGenFunction::emitCompoundStmt(const CompoundStmt &s, SymTableScopeTy varScope(symbolTable); mlir::Location scopeLoc = getLoc(s.getSourceRange()); mlir::OpBuilder::InsertPoint scopeInsPt; - builder.create( - scopeLoc, [&](mlir::OpBuilder &b, mlir::Type &type, mlir::Location loc) { + cir::ScopeOp::create( + builder, scopeLoc, + [&](mlir::OpBuilder &b, mlir::Type &type, mlir::Location loc) { scopeInsPt = b.saveInsertionPoint(); }); mlir::OpBuilder::InsertionGuard guard(builder); @@ -423,12 +424,12 @@ mlir::LogicalResult CIRGenFunction::emitIfStmt(const IfStmt &s) { // LexicalScope ConditionScope(*this, S.getCond()->getSourceRange()); // The if scope contains the full source range for IfStmt. 
mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create( - scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{*this, scopeLoc, builder.getInsertionBlock()}; - res = ifStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, scopeLoc, + builder.getInsertionBlock()}; + res = ifStmtBuilder(); + }); return res; } @@ -576,11 +577,11 @@ mlir::LogicalResult CIRGenFunction::emitLabel(const clang::LabelDecl &d) { mlir::OpBuilder::InsertionGuard guard(builder); labelBlock = builder.createBlock(builder.getBlock()->getParent()); } - builder.create(getLoc(d.getSourceRange()), labelBlock); + cir::BrOp::create(builder, getLoc(d.getSourceRange()), labelBlock); } builder.setInsertionPointToEnd(labelBlock); - builder.create(getLoc(d.getSourceRange()), d.getName()); + cir::LabelOp::create(builder, getLoc(d.getSourceRange()), d.getName()); builder.setInsertionPointToEnd(labelBlock); // FIXME: emit debug info for labels, incrementProfileCounter @@ -617,7 +618,7 @@ CIRGenFunction::emitCaseDefaultCascade(const T *stmt, mlir::Type condType, const Stmt *sub = stmt->getSubStmt(); mlir::OpBuilder::InsertPoint insertPoint; - builder.create(loc, value, kind, insertPoint); + CaseOp::create(builder, loc, value, kind, insertPoint); { mlir::OpBuilder::InsertionGuard guardSwitch(builder); @@ -789,16 +790,16 @@ CIRGenFunction::emitCXXForRangeStmt(const CXXForRangeStmt &s, mlir::LogicalResult res = mlir::success(); mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - // Create a cleanup scope for the condition - // variable cleanups. Logical equivalent from - // LLVM codegn for LexicalScope - // ConditionScope(*this, S.getSourceRange())... - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = forStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // Create a cleanup scope for the condition + // variable cleanups. Logical equivalent from + // LLVM codegn for LexicalScope + // ConditionScope(*this, S.getSourceRange())... + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = forStmtBuilder(); + }); if (res.failed()) return res; @@ -841,7 +842,7 @@ mlir::LogicalResult CIRGenFunction::emitForStmt(const ForStmt &s) { // scalar type. 
condVal = evaluateExprAsBool(s.getCond()); } else { - condVal = b.create(loc, builder.getTrueAttr()); + condVal = cir::ConstantOp::create(b, loc, builder.getTrueAttr()); } builder.createCondition(condVal); }, @@ -865,12 +866,12 @@ mlir::LogicalResult CIRGenFunction::emitForStmt(const ForStmt &s) { auto res = mlir::success(); auto scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = forStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = forStmtBuilder(); + }); if (res.failed()) return res; @@ -916,12 +917,12 @@ mlir::LogicalResult CIRGenFunction::emitDoStmt(const DoStmt &s) { mlir::LogicalResult res = mlir::success(); mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = doStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = doStmtBuilder(); + }); if (res.failed()) return res; @@ -972,12 +973,12 @@ mlir::LogicalResult CIRGenFunction::emitWhileStmt(const WhileStmt &s) { mlir::LogicalResult res = mlir::success(); mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = whileStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = whileStmtBuilder(); + }); if (res.failed()) return res; @@ -1048,8 +1049,8 @@ mlir::LogicalResult CIRGenFunction::emitSwitchStmt(const clang::SwitchStmt &s) { assert(!cir::MissingFeatures::insertBuiltinUnpredictable()); mlir::LogicalResult res = mlir::success(); - swop = builder.create( - getLoc(s.getBeginLoc()), condV, + swop = SwitchOp::create( + builder, getLoc(s.getBeginLoc()), condV, /*switchBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc, mlir::OperationState &os) { curLexScope->setAsSwitch(); @@ -1067,12 +1068,12 @@ mlir::LogicalResult CIRGenFunction::emitSwitchStmt(const clang::SwitchStmt &s) { // The switch scope contains the full source range for SwitchStmt. 
mlir::Location scopeLoc = getLoc(s.getSourceRange()); mlir::LogicalResult res = mlir::success(); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = switchStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = switchStmtBuilder(); + }); llvm::SmallVector cases; swop.collectCases(cases); @@ -1096,7 +1097,7 @@ void CIRGenFunction::emitReturnOfRValue(mlir::Location loc, RValue rv, } mlir::Block *retBlock = curLexScope->getOrCreateRetBlock(*this, loc); assert(!cir::MissingFeatures::emitBranchThroughCleanup()); - builder.create(loc, retBlock); + cir::BrOp::create(builder, loc, retBlock); if (ehStack.stable_begin() != currentCleanupStackDepth) cgm.errorNYI(loc, "return of r-value with cleanup stack"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 02bb46d0e4466..77e6f8389c361 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -30,7 +30,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpAssociatedStmt( llvm::SmallVector retTy; llvm::SmallVector operands; - auto op = builder.create(start, retTy, operands); + auto op = Op::create(builder, start, retTy, operands); emitOpenACCClauses(op, dirKind, dirLoc, clauses); @@ -42,7 +42,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpAssociatedStmt( LexicalScope ls{*this, start, builder.getInsertionBlock()}; res = emitStmt(associatedStmt, /*useCurrentScope=*/true); - builder.create(end); + TermOp::create(builder, end); } return res; } @@ -73,7 +73,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( llvm::SmallVector retTy; llvm::SmallVector operands; - auto computeOp = builder.create(start, retTy, operands); + auto computeOp = Op::create(builder, start, retTy, operands); computeOp.setCombinedAttr(builder.getUnitAttr()); mlir::acc::LoopOp loopOp; @@ -85,7 +85,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( builder.setInsertionPointToEnd(&block); LexicalScope ls{*this, start, builder.getInsertionBlock()}; - auto loopOp = builder.create(start, retTy, operands); + auto loopOp = LoopOp::create(builder, start, retTy, operands); loopOp.setCombinedAttr(mlir::acc::CombinedConstructsTypeAttr::get( builder.getContext(), CombinedType::value)); @@ -99,14 +99,14 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( res = emitStmt(loopStmt, /*useCurrentScope=*/true); - builder.create(end); + mlir::acc::YieldOp::create(builder, end); } emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses); updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind); - builder.create(end); + TermOp::create(builder, end); } return res; @@ -118,7 +118,7 @@ Op CIRGenFunction::emitOpenACCOp( llvm::ArrayRef clauses) { llvm::SmallVector retTy; llvm::SmallVector operands; - auto op = builder.create(start, retTy, operands); + auto op = Op::create(builder, start, retTy, operands); emitOpenACCClauses(op, dirKind, dirLoc, clauses); return op; @@ -197,8 +197,8 @@ CIRGenFunction::emitOpenACCWaitConstruct(const OpenACCWaitConstruct &s) { ? 
mlir::IntegerType::SignednessSemantics::Signed : mlir::IntegerType::SignednessSemantics::Unsigned); - auto conversionOp = builder.create( - exprLoc, targetType, expr); + auto conversionOp = mlir::UnrealizedConversionCastOp::create( + builder, exprLoc, targetType, expr); return conversionOp.getResult(0); }; @@ -294,9 +294,9 @@ CIRGenFunction::emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s) { CIRGenFunction::OpenACCDataOperandInfo opInfo = getOpenACCDataOperandInfo(var); - auto cacheOp = builder.create( - opInfo.beginLoc, opInfo.varValue, - /*structured=*/false, /*implicit=*/false, opInfo.name, opInfo.bounds); + auto cacheOp = CacheOp::create(builder, opInfo.beginLoc, opInfo.varValue, + /*structured=*/false, /*implicit=*/false, + opInfo.name, opInfo.bounds); loopOp.getCacheOperandsMutable().append(cacheOp.getResult()); } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp index f3911ae8b6772..c5b89bd7b1fb6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp @@ -58,7 +58,7 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { mlir::Location end = getLoc(s.getSourceRange().getEnd()); llvm::SmallVector retTy; llvm::SmallVector operands; - auto op = builder.create(start, retTy, operands); + auto op = LoopOp::create(builder, start, retTy, operands); // TODO(OpenACC): In the future we are going to need to come up with a // transformation here that can teach the acc.loop how to figure out the @@ -133,7 +133,7 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { ActiveOpenACCLoopRAII activeLoop{*this, &op}; stmtRes = emitStmt(s.getLoop(), /*useCurrentScope=*/true); - builder.create(end); + mlir::acc::YieldOp::create(builder, end); } return stmtRes; diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index fa180f54c4474..2d2ef422bfaef 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -95,8 +95,8 @@ Operation *cir::CIRDialect::materializeConstant(mlir::OpBuilder &builder, mlir::Attribute value, mlir::Type type, mlir::Location loc) { - return builder.create(loc, type, - mlir::cast(value)); + return cir::ConstantOp::create(builder, loc, type, + mlir::cast(value)); } //===----------------------------------------------------------------------===// @@ -184,7 +184,7 @@ static LogicalResult ensureRegionTerm(OpAsmParser &parser, Region ®ion, // Terminator was omitted correctly: recreate it. builder.setInsertionPointToEnd(&block); - builder.create(eLoc); + cir::YieldOp::create(builder, eLoc); return success(); } @@ -977,7 +977,7 @@ void cir::IfOp::print(OpAsmPrinter &p) { /// Default callback for IfOp builders. 
void cir::buildTerminatedBody(OpBuilder &builder, Location loc) { // add cir.yield to end of the block - builder.create(loc); + cir::YieldOp::create(builder, loc); } /// Given the region at `index`, or the parent operation if `index` is None, diff --git a/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp b/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp index 7e96ae99b690b..66469e208d7b0 100644 --- a/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp @@ -34,8 +34,8 @@ llvm::SmallVector cir::AllocaOp::getPromotableSlots() { Value cir::AllocaOp::getDefaultValue(const MemorySlot &slot, OpBuilder &builder) { - return builder.create(getLoc(), - cir::UndefAttr::get(slot.elemType)); + return cir::ConstantOp::create(builder, getLoc(), + cir::UndefAttr::get(slot.elemType)); } void cir::AllocaOp::handleBlockArgument(const MemorySlot &slot, diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp index 46bd186ccada1..21c96febf8403 100644 --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp @@ -100,8 +100,8 @@ struct CIRIfFlattening : public mlir::OpRewritePattern { } rewriter.setInsertionPointToEnd(currentBlock); - rewriter.create(loc, ifOp.getCondition(), thenBeforeBody, - elseBeforeBody); + cir::BrCondOp::create(rewriter, loc, ifOp.getCondition(), thenBeforeBody, + elseBeforeBody); if (!emptyElse) { rewriter.setInsertionPointToEnd(elseAfterBody); @@ -154,7 +154,7 @@ class CIRScopeOpFlattening : public mlir::OpRewritePattern { // Save stack and then branch into the body of the region. rewriter.setInsertionPointToEnd(currentBlock); assert(!cir::MissingFeatures::stackSaveOp()); - rewriter.create(loc, mlir::ValueRange(), beforeBody); + cir::BrOp::create(rewriter, loc, mlir::ValueRange(), beforeBody); // Replace the scopeop return with a branch that jumps out of the body. // Stack restore before leaving the body region. @@ -195,26 +195,27 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { cir::IntType sIntType = cir::IntType::get(op.getContext(), 32, true); cir::IntType uIntType = cir::IntType::get(op.getContext(), 32, false); - cir::ConstantOp rangeLength = rewriter.create( - op.getLoc(), cir::IntAttr::get(sIntType, upperBound - lowerBound)); + cir::ConstantOp rangeLength = cir::ConstantOp::create( + rewriter, op.getLoc(), + cir::IntAttr::get(sIntType, upperBound - lowerBound)); - cir::ConstantOp lowerBoundValue = rewriter.create( - op.getLoc(), cir::IntAttr::get(sIntType, lowerBound)); + cir::ConstantOp lowerBoundValue = cir::ConstantOp::create( + rewriter, op.getLoc(), cir::IntAttr::get(sIntType, lowerBound)); cir::BinOp diffValue = - rewriter.create(op.getLoc(), sIntType, cir::BinOpKind::Sub, - op.getCondition(), lowerBoundValue); + cir::BinOp::create(rewriter, op.getLoc(), sIntType, cir::BinOpKind::Sub, + op.getCondition(), lowerBoundValue); // Use unsigned comparison to check if the condition is in the range. 
- cir::CastOp uDiffValue = rewriter.create( - op.getLoc(), uIntType, CastKind::integral, diffValue); - cir::CastOp uRangeLength = rewriter.create( - op.getLoc(), uIntType, CastKind::integral, rangeLength); - - cir::CmpOp cmpResult = rewriter.create( - op.getLoc(), cir::BoolType::get(op.getContext()), cir::CmpOpKind::le, - uDiffValue, uRangeLength); - rewriter.create(op.getLoc(), cmpResult, rangeDestination, - defaultDestination); + cir::CastOp uDiffValue = cir::CastOp::create( + rewriter, op.getLoc(), uIntType, CastKind::integral, diffValue); + cir::CastOp uRangeLength = cir::CastOp::create( + rewriter, op.getLoc(), uIntType, CastKind::integral, rangeLength); + + cir::CmpOp cmpResult = cir::CmpOp::create( + rewriter, op.getLoc(), cir::BoolType::get(op.getContext()), + cir::CmpOpKind::le, uDiffValue, uRangeLength); + cir::BrCondOp::create(rewriter, op.getLoc(), cmpResult, rangeDestination, + defaultDestination); return resBlock; } @@ -262,7 +263,7 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { rewriteYieldOp(rewriter, switchYield, exitBlock); rewriter.setInsertionPointToEnd(originalBlock); - rewriter.create(op.getLoc(), swopBlock); + cir::BrOp::create(rewriter, op.getLoc(), swopBlock); } // Allocate required data structures (disconsider default case in @@ -331,8 +332,8 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { mlir::Block *newBlock = rewriter.splitBlock(oldBlock, nextOp->getIterator()); rewriter.setInsertionPointToEnd(oldBlock); - rewriter.create(nextOp->getLoc(), mlir::ValueRange(), - newBlock); + cir::BrOp::create(rewriter, nextOp->getLoc(), mlir::ValueRange(), + newBlock); rewriteYieldOp(rewriter, yieldOp, newBlock); } } @@ -346,7 +347,7 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { // Create a branch to the entry of the inlined region. rewriter.setInsertionPointToEnd(oldBlock); - rewriter.create(caseOp.getLoc(), &entryBlock); + cir::BrOp::create(rewriter, caseOp.getLoc(), &entryBlock); } // Remove all cases since we've inlined the regions. @@ -427,7 +428,7 @@ class CIRLoopOpInterfaceFlattening // Setup loop entry branch. rewriter.setInsertionPointToEnd(entry); - rewriter.create(op.getLoc(), &op.getEntry().front()); + cir::BrOp::create(rewriter, op.getLoc(), &op.getEntry().front()); // Branch from condition region to body or exit. 
auto conditionOp = cast(cond->getTerminator()); @@ -499,7 +500,7 @@ class CIRTernaryOpFlattening : public mlir::OpRewritePattern { locs.push_back(loc); Block *continueBlock = rewriter.createBlock(remainingOpsBlock, op->getResultTypes(), locs); - rewriter.create(loc, remainingOpsBlock); + cir::BrOp::create(rewriter, loc, remainingOpsBlock); Region &trueRegion = op.getTrueRegion(); Block *trueBlock = &trueRegion.front(); @@ -542,7 +543,7 @@ class CIRTernaryOpFlattening : public mlir::OpRewritePattern { rewriter.inlineRegionBefore(falseRegion, continueBlock); rewriter.setInsertionPointToEnd(condBlock); - rewriter.create(loc, op.getCond(), trueBlock, falseBlock); + cir::BrCondOp::create(rewriter, loc, op.getCond(), trueBlock, falseBlock); rewriter.replaceOp(op, continueBlock->getArguments()); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index d99c36241ab68..813dc1b3049a4 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -155,7 +155,7 @@ cir::FuncOp LoweringPreparePass::buildRuntimeFunction( cir::FuncOp f = dyn_cast_or_null(SymbolTable::lookupNearestSymbolFrom( mlirModule, StringAttr::get(mlirModule->getContext(), name))); if (!f) { - f = builder.create(loc, name, type); + f = cir::FuncOp::create(builder, loc, name, type); f.setLinkageAttr( cir::GlobalLinkageKindAttr::get(builder.getContext(), linkage)); mlir::SymbolTable::setSymbolVisibility( @@ -400,12 +400,12 @@ buildRangeReductionComplexDiv(CIRBaseBuilderTy &builder, mlir::Location loc, builder.createYield(loc, result); }; - auto cFabs = builder.create(loc, c); - auto dFabs = builder.create(loc, d); + auto cFabs = cir::FAbsOp::create(builder, loc, c); + auto dFabs = cir::FAbsOp::create(builder, loc, d); cir::CmpOp cmpResult = builder.createCompare(loc, cir::CmpOpKind::ge, cFabs, dFabs); - auto ternary = builder.create( - loc, cmpResult, trueBranchBuilder, falseBranchBuilder); + auto ternary = cir::TernaryOp::create(builder, loc, cmpResult, + trueBranchBuilder, falseBranchBuilder); return ternary.getResult(); } @@ -920,15 +920,15 @@ static void lowerArrayDtorCtorIntoLoop(cir::CIRBaseBuilderTy &builder, loc, /*condBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = b.create(loc, eltTy, tmpAddr); + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); mlir::Type boolTy = cir::BoolType::get(b.getContext()); - auto cmp = builder.create(loc, boolTy, cir::CmpOpKind::ne, - currentElement, stop); + auto cmp = cir::CmpOp::create(builder, loc, boolTy, cir::CmpOpKind::ne, + currentElement, stop); builder.createCondition(cmp); }, /*bodyBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = b.create(loc, eltTy, tmpAddr); + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); cir::CallOp ctorCall; op->walk([&](cir::CallOp c) { ctorCall = c; }); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp index 11ce2a81e7f39..cbf6f475b339c 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp @@ -77,10 +77,11 @@ buildDynamicCastAfterNullCheck(cir::CIRBaseBuilderTy &builder, if (op.isRefCast()) { // Emit a cir.if that checks the casted value. 
mlir::Value castedValueIsNull = builder.createPtrIsNull(castedPtr); - builder.create( - loc, castedValueIsNull, false, [&](mlir::OpBuilder &, mlir::Location) { - buildBadCastCall(builder, loc, castInfo.getBadCastFunc()); - }); + cir::IfOp::create(builder, loc, castedValueIsNull, false, + [&](mlir::OpBuilder &, mlir::Location) { + buildBadCastCall(builder, loc, + castInfo.getBadCastFunc()); + }); } // Note that castedPtr is a void*. Cast it to a pointer to the destination diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index bb75f2d94066f..61660db7f96f7 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -90,12 +90,12 @@ static mlir::Value createIntCast(mlir::OpBuilder &bld, mlir::Value src, mlir::Location loc = src.getLoc(); if (dstWidth > srcWidth && isSigned) - return bld.create(loc, dstTy, src); + return mlir::LLVM::SExtOp::create(bld, loc, dstTy, src); if (dstWidth > srcWidth) - return bld.create(loc, dstTy, src); + return mlir::LLVM::ZExtOp::create(bld, loc, dstTy, src); if (dstWidth < srcWidth) - return bld.create(loc, dstTy, src); - return bld.create(loc, dstTy, src); + return mlir::LLVM::TruncOp::create(bld, loc, dstTy, src); + return mlir::LLVM::BitcastOp::create(bld, loc, dstTy, src); } static mlir::LLVM::Visibility @@ -204,12 +204,12 @@ static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter, auto loc = llvmSrc.getLoc(); if (cirSrcWidth < cirDstIntWidth) { if (isUnsigned) - return rewriter.create(loc, llvmDstIntTy, llvmSrc); - return rewriter.create(loc, llvmDstIntTy, llvmSrc); + return mlir::LLVM::ZExtOp::create(rewriter, loc, llvmDstIntTy, llvmSrc); + return mlir::LLVM::SExtOp::create(rewriter, loc, llvmDstIntTy, llvmSrc); } // Otherwise truncate - return rewriter.create(loc, llvmDstIntTy, llvmSrc); + return mlir::LLVM::TruncOp::create(rewriter, loc, llvmDstIntTy, llvmSrc); } class CIRAttrToValue { @@ -315,15 +315,17 @@ static mlir::LLVM::CallIntrinsicOp replaceOpWithCallLLVMIntrinsicOp( /// IntAttr visitor. mlir::Value CIRAttrToValue::visitCirAttr(cir::IntAttr intAttr) { mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(intAttr.getType()), intAttr.getValue()); + return mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(intAttr.getType()), + intAttr.getValue()); } /// FPAttr visitor. mlir::Value CIRAttrToValue::visitCirAttr(cir::FPAttr fltAttr) { mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(fltAttr.getType()), fltAttr.getValue()); + return mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(fltAttr.getType()), + fltAttr.getValue()); } /// ConstComplexAttr visitor. 
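The CIRAttrToValue visitors in this file share a small recipe: scalar attributes (IntAttr and FPAttr above) lower to a single mlir::LLVM::ConstantOp, while aggregate attributes (ConstArrayAttr and ConstRecordAttr below) are materialized by seeding a value of the converted LLVM type and inserting each lowered member. A condensed sketch of the aggregate path, where rewriter, loc, llvmTy, elements, and visit stand in for the visitor's real members and are named here only for illustration:

// Seed the aggregate, then insert each recursively lowered element.
mlir::Value result = mlir::LLVM::UndefOp::create(rewriter, loc, llvmTy);
for (auto [idx, elt] : llvm::enumerate(elements)) {
  mlir::Value init = visit(elt); // lower the element attribute to a value
  result = mlir::LLVM::InsertValueOp::create(rewriter, loc, result, init, idx);
}

The ConstArrayAttr visitor below uses mlir::LLVM::ZeroOp instead of UndefOp as the seed when the attribute has trailing zeros.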
@@ -350,8 +352,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstComplexAttr complexAttr) { } mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(complexAttr.getType()), + return mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(complexAttr.getType()), rewriter.getArrayAttr(components)); } @@ -359,15 +361,16 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstComplexAttr complexAttr) { mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstPtrAttr ptrAttr) { mlir::Location loc = parentOp->getLoc(); if (ptrAttr.isNullValue()) { - return rewriter.create( - loc, converter->convertType(ptrAttr.getType())); + return mlir::LLVM::ZeroOp::create( + rewriter, loc, converter->convertType(ptrAttr.getType())); } mlir::DataLayout layout(parentOp->getParentOfType()); - mlir::Value ptrVal = rewriter.create( - loc, rewriter.getIntegerType(layout.getTypeSizeInBits(ptrAttr.getType())), + mlir::Value ptrVal = mlir::LLVM::ConstantOp::create( + rewriter, loc, + rewriter.getIntegerType(layout.getTypeSizeInBits(ptrAttr.getType())), ptrAttr.getValue().getInt()); - return rewriter.create( - loc, converter->convertType(ptrAttr.getType()), ptrVal); + return mlir::LLVM::IntToPtrOp::create( + rewriter, loc, converter->convertType(ptrAttr.getType()), ptrVal); } // ConstArrayAttr visitor @@ -378,10 +381,10 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { if (attr.hasTrailingZeros()) { mlir::Type arrayTy = attr.getType(); - result = rewriter.create( - loc, converter->convertType(arrayTy)); + result = mlir::LLVM::ZeroOp::create(rewriter, loc, + converter->convertType(arrayTy)); } else { - result = rewriter.create(loc, llvmTy); + result = mlir::LLVM::UndefOp::create(rewriter, loc, llvmTy); } // Iteratively lower each constant element of the array. @@ -390,7 +393,7 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { mlir::DataLayout dataLayout(parentOp->getParentOfType()); mlir::Value init = visit(elt); result = - rewriter.create(loc, result, init, idx); + mlir::LLVM::InsertValueOp::create(rewriter, loc, result, init, idx); } } else if (auto strAttr = mlir::dyn_cast(attr.getElts())) { // TODO(cir): this diverges from traditional lowering. Normally the string @@ -399,10 +402,10 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { assert(arrayTy && "String attribute must have an array type"); mlir::Type eltTy = arrayTy.getElementType(); for (auto [idx, elt] : llvm::enumerate(strAttr)) { - auto init = rewriter.create( - loc, converter->convertType(eltTy), elt); + auto init = mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(eltTy), elt); result = - rewriter.create(loc, result, init, idx); + mlir::LLVM::InsertValueOp::create(rewriter, loc, result, init, idx); } } else { llvm_unreachable("unexpected ConstArrayAttr elements"); @@ -415,12 +418,13 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstRecordAttr constRecord) { const mlir::Type llvmTy = converter->convertType(constRecord.getType()); const mlir::Location loc = parentOp->getLoc(); - mlir::Value result = rewriter.create(loc, llvmTy); + mlir::Value result = mlir::LLVM::UndefOp::create(rewriter, loc, llvmTy); // Iteratively lower each constant element of the record. 
for (auto [idx, elt] : llvm::enumerate(constRecord.getMembers())) { mlir::Value init = visit(elt); - result = rewriter.create(loc, result, init, idx); + result = + mlir::LLVM::InsertValueOp::create(rewriter, loc, result, init, idx); } return result; @@ -447,8 +451,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstVectorAttr attr) { mlirValues.push_back(mlirAttr); } - return rewriter.create( - loc, llvmTy, + return mlir::LLVM::ConstantOp::create( + rewriter, loc, llvmTy, mlir::DenseElementsAttr::get(mlir::cast(llvmTy), mlirValues)); } @@ -483,8 +487,9 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::GlobalViewAttr globalAttr) { } mlir::Location loc = parentOp->getLoc(); - mlir::Value addrOp = rewriter.create( - loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()), symName); + mlir::Value addrOp = mlir::LLVM::AddressOfOp::create( + rewriter, loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()), + symName); if (globalAttr.getIndices()) { llvm::SmallVector indices; @@ -499,8 +504,9 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::GlobalViewAttr globalAttr) { } mlir::Type resTy = addrOp.getType(); mlir::Type eltTy = converter->convertType(sourceType); - addrOp = rewriter.create( - loc, resTy, eltTy, addrOp, indices, mlir::LLVM::GEPNoWrapFlags::none); + addrOp = + mlir::LLVM::GEPOp::create(rewriter, loc, resTy, eltTy, addrOp, indices, + mlir::LLVM::GEPNoWrapFlags::none); } // The incubator has handling here for the attribute having integer type, but @@ -517,8 +523,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::GlobalViewAttr globalAttr) { return addrOp; mlir::Type llvmDstTy = converter->convertType(globalAttr.getType()); - return rewriter.create(parentOp->getLoc(), llvmDstTy, - addrOp); + return mlir::LLVM::BitcastOp::create(rewriter, parentOp->getLoc(), + llvmDstTy, addrOp); } llvm_unreachable("Expecting pointer or integer type for GlobalViewAttr"); @@ -557,8 +563,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::VTableAttr vtableArr) { /// ZeroAttr visitor. 
mlir::Value CIRAttrToValue::visitCirAttr(cir::ZeroAttr attr) { mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(attr.getType())); + return mlir::LLVM::ZeroOp::create(rewriter, loc, + converter->convertType(attr.getType())); } // This class handles rewriting initializer attributes for types that do not @@ -666,8 +672,8 @@ mlir::LogicalResult CIRToLLVMAssumeAlignedOpLowering::matchAndRewrite( mlir::LogicalResult CIRToLLVMAssumeSepStorageOpLowering::matchAndRewrite( cir::AssumeSepStorageOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { - auto cond = rewriter.create(op.getLoc(), - rewriter.getI1Type(), 1); + auto cond = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + rewriter.getI1Type(), 1); rewriter.replaceOpWithNewOp( op, cond, mlir::LLVM::AssumeSeparateStorageTag{}, adaptor.getPtr1(), adaptor.getPtr2()); @@ -914,28 +920,28 @@ mlir::LogicalResult CIRToLLVMAtomicFetchOpLowering::matchAndRewrite( mlir::LogicalResult CIRToLLVMBitClrsbOpLowering::matchAndRewrite( cir::BitClrsbOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { - auto zero = rewriter.create( - op.getLoc(), adaptor.getInput().getType(), 0); - auto isNeg = rewriter.create( - op.getLoc(), + auto zero = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + adaptor.getInput().getType(), 0); + auto isNeg = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), mlir::LLVM::ICmpPredicateAttr::get(rewriter.getContext(), mlir::LLVM::ICmpPredicate::slt), adaptor.getInput(), zero); - auto negOne = rewriter.create( - op.getLoc(), adaptor.getInput().getType(), -1); - auto flipped = rewriter.create(op.getLoc(), - adaptor.getInput(), negOne); + auto negOne = mlir::LLVM::ConstantOp::create( + rewriter, op.getLoc(), adaptor.getInput().getType(), -1); + auto flipped = mlir::LLVM::XOrOp::create(rewriter, op.getLoc(), + adaptor.getInput(), negOne); - auto select = rewriter.create( - op.getLoc(), isNeg, flipped, adaptor.getInput()); + auto select = mlir::LLVM::SelectOp::create(rewriter, op.getLoc(), isNeg, + flipped, adaptor.getInput()); auto resTy = getTypeConverter()->convertType(op.getType()); - auto clz = rewriter.create( - op.getLoc(), resTy, select, /*is_zero_poison=*/false); + auto clz = mlir::LLVM::CountLeadingZerosOp::create( + rewriter, op.getLoc(), resTy, select, /*is_zero_poison=*/false); - auto one = rewriter.create(op.getLoc(), resTy, 1); - auto res = rewriter.create(op.getLoc(), clz, one); + auto one = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 1); + auto res = mlir::LLVM::SubOp::create(rewriter, op.getLoc(), clz, one); rewriter.replaceOp(op, res); return mlir::LogicalResult::success(); @@ -945,8 +951,8 @@ mlir::LogicalResult CIRToLLVMBitClzOpLowering::matchAndRewrite( cir::BitClzOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto llvmOp = rewriter.create( - op.getLoc(), resTy, adaptor.getInput(), op.getPoisonZero()); + auto llvmOp = mlir::LLVM::CountLeadingZerosOp::create( + rewriter, op.getLoc(), resTy, adaptor.getInput(), op.getPoisonZero()); rewriter.replaceOp(op, llvmOp); return mlir::LogicalResult::success(); } @@ -955,8 +961,8 @@ mlir::LogicalResult CIRToLLVMBitCtzOpLowering::matchAndRewrite( cir::BitCtzOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto llvmOp = rewriter.create( - op.getLoc(), resTy, adaptor.getInput(), 
op.getPoisonZero()); + auto llvmOp = mlir::LLVM::CountTrailingZerosOp::create( + rewriter, op.getLoc(), resTy, adaptor.getInput(), op.getPoisonZero()); rewriter.replaceOp(op, llvmOp); return mlir::LogicalResult::success(); } @@ -965,23 +971,24 @@ mlir::LogicalResult CIRToLLVMBitFfsOpLowering::matchAndRewrite( cir::BitFfsOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto ctz = rewriter.create( - op.getLoc(), resTy, adaptor.getInput(), /*is_zero_poison=*/true); + auto ctz = mlir::LLVM::CountTrailingZerosOp::create(rewriter, op.getLoc(), + resTy, adaptor.getInput(), + /*is_zero_poison=*/true); - auto one = rewriter.create(op.getLoc(), resTy, 1); - auto ctzAddOne = rewriter.create(op.getLoc(), ctz, one); + auto one = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 1); + auto ctzAddOne = mlir::LLVM::AddOp::create(rewriter, op.getLoc(), ctz, one); - auto zeroInputTy = rewriter.create( - op.getLoc(), adaptor.getInput().getType(), 0); - auto isZero = rewriter.create( - op.getLoc(), + auto zeroInputTy = mlir::LLVM::ConstantOp::create( + rewriter, op.getLoc(), adaptor.getInput().getType(), 0); + auto isZero = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), mlir::LLVM::ICmpPredicateAttr::get(rewriter.getContext(), mlir::LLVM::ICmpPredicate::eq), adaptor.getInput(), zeroInputTy); - auto zero = rewriter.create(op.getLoc(), resTy, 0); - auto res = rewriter.create(op.getLoc(), isZero, zero, - ctzAddOne); + auto zero = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 0); + auto res = mlir::LLVM::SelectOp::create(rewriter, op.getLoc(), isZero, zero, + ctzAddOne); rewriter.replaceOp(op, res); return mlir::LogicalResult::success(); @@ -991,12 +998,12 @@ mlir::LogicalResult CIRToLLVMBitParityOpLowering::matchAndRewrite( cir::BitParityOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto popcnt = rewriter.create(op.getLoc(), resTy, - adaptor.getInput()); + auto popcnt = mlir::LLVM::CtPopOp::create(rewriter, op.getLoc(), resTy, + adaptor.getInput()); - auto one = rewriter.create(op.getLoc(), resTy, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 1); auto popcntMod2 = - rewriter.create(op.getLoc(), popcnt, one); + mlir::LLVM::AndOp::create(rewriter, op.getLoc(), popcnt, one); rewriter.replaceOp(op, popcntMod2); return mlir::LogicalResult::success(); @@ -1006,8 +1013,8 @@ mlir::LogicalResult CIRToLLVMBitPopcountOpLowering::matchAndRewrite( cir::BitPopcountOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto llvmOp = rewriter.create(op.getLoc(), resTy, - adaptor.getInput()); + auto llvmOp = mlir::LLVM::CtPopOp::create(rewriter, op.getLoc(), resTy, + adaptor.getInput()); rewriter.replaceOp(op, llvmOp); return mlir::LogicalResult::success(); } @@ -1067,8 +1074,8 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite( } case cir::CastKind::int_to_bool: { mlir::Value llvmSrcVal = adaptor.getSrc(); - mlir::Value zeroInt = rewriter.create( - castOp.getLoc(), llvmSrcVal.getType(), 0); + mlir::Value zeroInt = mlir::LLVM::ConstantOp::create( + rewriter, castOp.getLoc(), llvmSrcVal.getType(), 0); rewriter.replaceOpWithNewOp( castOp, mlir::LLVM::ICmpPredicate::ne, llvmSrcVal, zeroInt); break; @@ -1132,8 +1139,8 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite( auto kind = 
mlir::LLVM::FCmpPredicate::une; // Check if float is not equal to zero. - auto zeroFloat = rewriter.create( - castOp.getLoc(), llvmSrcVal.getType(), + auto zeroFloat = mlir::LLVM::ConstantOp::create( + rewriter, castOp.getLoc(), llvmSrcVal.getType(), mlir::FloatAttr::get(llvmSrcVal.getType(), 0.0)); // Extend comparison result to either bool (C++) or int (C). @@ -1204,8 +1211,8 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite( } case cir::CastKind::ptr_to_bool: { mlir::Value llvmSrcVal = adaptor.getSrc(); - mlir::Value zeroPtr = rewriter.create( - castOp.getLoc(), llvmSrcVal.getType()); + mlir::Value zeroPtr = mlir::LLVM::ZeroOp::create(rewriter, castOp.getLoc(), + llvmSrcVal.getType()); rewriter.replaceOpWithNewOp( castOp, mlir::LLVM::ICmpPredicate::ne, llvmSrcVal, zeroPtr); break; @@ -1275,10 +1282,10 @@ mlir::LogicalResult CIRToLLVMPtrStrideOpLowering::matchAndRewrite( // Rewrite the sub in front of extensions/trunc if (rewriteSub) { - index = rewriter.create( - index.getLoc(), index.getType(), - rewriter.create(index.getLoc(), - index.getType(), 0), + index = mlir::LLVM::SubOp::create( + rewriter, index.getLoc(), index.getType(), + mlir::LLVM::ConstantOp::create(rewriter, index.getLoc(), + index.getType(), 0), index); rewriter.eraseOp(sub); } @@ -1310,11 +1317,11 @@ mlir::LogicalResult CIRToLLVMBaseClassAddrOpLowering::matchAndRewrite( baseClassOp, resultType, byteType, derivedAddr, offset); } else { auto loc = baseClassOp.getLoc(); - mlir::Value isNull = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::eq, derivedAddr, - rewriter.create(loc, derivedAddr.getType())); - mlir::Value adjusted = rewriter.create( - loc, resultType, byteType, derivedAddr, offset); + mlir::Value isNull = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, derivedAddr, + mlir::LLVM::ZeroOp::create(rewriter, loc, derivedAddr.getType())); + mlir::Value adjusted = mlir::LLVM::GEPOp::create( + rewriter, loc, resultType, byteType, derivedAddr, offset); rewriter.replaceOpWithNewOp(baseClassOp, isNull, derivedAddr, adjusted); } @@ -1335,8 +1342,8 @@ mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite( mlir::Value size = op.isDynamic() ? adaptor.getDynAllocSize() - : rewriter.create( - op.getLoc(), + : mlir::LLVM::ConstantOp::create( + rewriter, op.getLoc(), typeConverter->convertType(rewriter.getIndexType()), 1); mlir::Type elementTy = convertTypeForMemory(*getTypeConverter(), dataLayout, op.getAllocaType()); @@ -1694,13 +1701,13 @@ mlir::LogicalResult CIRToLLVMPtrDiffOpLowering::matchAndRewrite( auto dstTy = mlir::cast(op.getType()); mlir::Type llvmDstTy = getTypeConverter()->convertType(dstTy); - auto lhs = rewriter.create(op.getLoc(), llvmDstTy, - adaptor.getLhs()); - auto rhs = rewriter.create(op.getLoc(), llvmDstTy, - adaptor.getRhs()); + auto lhs = mlir::LLVM::PtrToIntOp::create(rewriter, op.getLoc(), llvmDstTy, + adaptor.getLhs()); + auto rhs = mlir::LLVM::PtrToIntOp::create(rewriter, op.getLoc(), llvmDstTy, + adaptor.getRhs()); auto diff = - rewriter.create(op.getLoc(), llvmDstTy, lhs, rhs); + mlir::LLVM::SubOp::create(rewriter, op.getLoc(), llvmDstTy, lhs, rhs); cir::PointerType ptrTy = op.getLhs().getType(); assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee()); @@ -1709,17 +1716,17 @@ mlir::LogicalResult CIRToLLVMPtrDiffOpLowering::matchAndRewrite( // Avoid silly division by 1. 
mlir::Value resultVal = diff.getResult(); if (typeSize != 1) { - auto typeSizeVal = rewriter.create( - op.getLoc(), llvmDstTy, typeSize); + auto typeSizeVal = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + llvmDstTy, typeSize); if (dstTy.isUnsigned()) { auto uDiv = - rewriter.create(op.getLoc(), diff, typeSizeVal); + mlir::LLVM::UDivOp::create(rewriter, op.getLoc(), diff, typeSizeVal); uDiv.setIsExact(true); resultVal = uDiv.getResult(); } else { auto sDiv = - rewriter.create(op.getLoc(), diff, typeSizeVal); + mlir::LLVM::SDivOp::create(rewriter, op.getLoc(), diff, typeSizeVal); sDiv.setIsExact(true); resultVal = sDiv.getResult(); } @@ -1847,8 +1854,8 @@ mlir::LogicalResult CIRToLLVMFuncOpLowering::matchAndRewrite( SmallVector attributes; lowerFuncAttributes(op, /*filterArgAndResAttrs=*/false, attributes); - mlir::LLVM::LLVMFuncOp fn = rewriter.create( - loc, op.getName(), llvmFnTy, linkage, isDsoLocal, cconv, + mlir::LLVM::LLVMFuncOp fn = mlir::LLVM::LLVMFuncOp::create( + rewriter, loc, op.getName(), llvmFnTy, linkage, isDsoLocal, cconv, mlir::SymbolRefAttr(), attributes); assert(!cir::MissingFeatures::opFuncMultipleReturnVals()); @@ -1884,8 +1891,8 @@ mlir::LogicalResult CIRToLLVMGetGlobalOpLowering::matchAndRewrite( } mlir::Type type = getTypeConverter()->convertType(op.getType()); - mlir::Operation *newop = - rewriter.create(op.getLoc(), type, op.getName()); + mlir::Operation *newop = mlir::LLVM::AddressOfOp::create( + rewriter, op.getLoc(), type, op.getName()); assert(!cir::MissingFeatures::opGlobalThreadLocal()); @@ -1941,7 +1948,7 @@ CIRToLLVMGlobalOpLowering::matchAndRewriteRegionInitializedGlobal( setupRegionInitializedLLVMGlobalOp(op, rewriter); CIRAttrToValue valueConverter(op, rewriter, typeConverter); mlir::Value value = valueConverter.visit(init); - rewriter.create(loc, value); + mlir::LLVM::ReturnOp::create(rewriter, loc, value); return mlir::success(); } @@ -2094,14 +2101,14 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( switch (op.getKind()) { case cir::UnaryOpKind::Inc: { assert(!isVector && "++ not allowed on vector types"); - auto one = rewriter.create(loc, llvmType, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1); rewriter.replaceOpWithNewOp( op, llvmType, adaptor.getInput(), one, maybeNSW); return mlir::success(); } case cir::UnaryOpKind::Dec: { assert(!isVector && "-- not allowed on vector types"); - auto one = rewriter.create(loc, llvmType, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1); rewriter.replaceOpWithNewOp(op, adaptor.getInput(), one, maybeNSW); return mlir::success(); @@ -2112,9 +2119,9 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( case cir::UnaryOpKind::Minus: { mlir::Value zero; if (isVector) - zero = rewriter.create(loc, llvmType); + zero = mlir::LLVM::ZeroOp::create(rewriter, loc, llvmType); else - zero = rewriter.create(loc, llvmType, 0); + zero = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 0); rewriter.replaceOpWithNewOp( op, zero, adaptor.getInput(), maybeNSW); return mlir::success(); @@ -2128,9 +2135,9 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( std::vector values(numElements, -1); mlir::DenseIntElementsAttr denseVec = rewriter.getI32VectorAttr(values); minusOne = - rewriter.create(loc, llvmType, denseVec); + mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, denseVec); } else { - minusOne = rewriter.create(loc, llvmType, -1); + minusOne = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, -1); } 
rewriter.replaceOpWithNewOp(op, adaptor.getInput(), minusOne); @@ -2145,16 +2152,16 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( switch (op.getKind()) { case cir::UnaryOpKind::Inc: { assert(!isVector && "++ not allowed on vector types"); - mlir::LLVM::ConstantOp one = rewriter.create( - loc, llvmType, rewriter.getFloatAttr(llvmType, 1.0)); + mlir::LLVM::ConstantOp one = mlir::LLVM::ConstantOp::create( + rewriter, loc, llvmType, rewriter.getFloatAttr(llvmType, 1.0)); rewriter.replaceOpWithNewOp(op, llvmType, one, adaptor.getInput()); return mlir::success(); } case cir::UnaryOpKind::Dec: { assert(!isVector && "-- not allowed on vector types"); - mlir::LLVM::ConstantOp minusOne = rewriter.create( - loc, llvmType, rewriter.getFloatAttr(llvmType, -1.0)); + mlir::LLVM::ConstantOp minusOne = mlir::LLVM::ConstantOp::create( + rewriter, loc, llvmType, rewriter.getFloatAttr(llvmType, -1.0)); rewriter.replaceOpWithNewOp(op, llvmType, minusOne, adaptor.getInput()); return mlir::success(); @@ -2185,7 +2192,7 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( return op.emitError() << "Unsupported unary operation on boolean type"; case cir::UnaryOpKind::Not: { assert(!isVector && "NYI: op! on vector mask"); - auto one = rewriter.create(loc, llvmType, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1); rewriter.replaceOpWithNewOp(op, adaptor.getInput(), one); return mlir::success(); @@ -2421,47 +2428,47 @@ mlir::LogicalResult CIRToLLVMCmpOpLowering::matchAndRewrite( mlir::Type complexElemTy = getTypeConverter()->convertType(complexType.getElementType()); - auto lhsReal = - rewriter.create(loc, complexElemTy, lhs, 0); - auto lhsImag = - rewriter.create(loc, complexElemTy, lhs, 1); - auto rhsReal = - rewriter.create(loc, complexElemTy, rhs, 0); - auto rhsImag = - rewriter.create(loc, complexElemTy, rhs, 1); + auto lhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{0})); + auto lhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{1})); + auto rhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{0})); + auto rhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{1})); if (cmpOp.getKind() == cir::CmpOpKind::eq) { if (complexElemTy.isInteger()) { - auto realCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::eq, lhsReal, rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::eq, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } - auto realCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::oeq, lhsReal, rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::oeq, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, mlir::LLVM::FCmpPredicate::oeq, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, mlir::LLVM::FCmpPredicate::oeq, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } if (cmpOp.getKind() == cir::CmpOpKind::ne) { if (complexElemTy.isInteger()) { - auto realCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::ne, lhsReal, 
rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::ne, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::ne, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::ne, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } - auto realCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::une, lhsReal, rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::une, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, mlir::LLVM::FCmpPredicate::une, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, mlir::LLVM::FCmpPredicate::une, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } @@ -2725,7 +2732,7 @@ static void buildCtorDtorList( index); } - builder.create(loc, result); + mlir::LLVM::ReturnOp::create(builder, loc, result); } // The applyPartialConversion function traverses blocks in the dominance order, @@ -2904,7 +2911,7 @@ void createLLVMFuncOpIfNotExist(mlir::ConversionPatternRewriter &rewriter, if (!sourceSymbol) { mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(enclosingFnOp); - rewriter.create(srcOp->getLoc(), fnName, fnTy); + mlir::LLVM::LLVMFuncOp::create(rewriter, srcOp->getLoc(), fnName, fnTy); } } @@ -2983,12 +2990,12 @@ mlir::LogicalResult CIRToLLVMTrapOpLowering::matchAndRewrite( mlir::Location loc = op->getLoc(); rewriter.eraseOp(op); - rewriter.create(loc); + mlir::LLVM::Trap::create(rewriter, loc); // Note that the call to llvm.trap is not a terminator in LLVM dialect. // So we must emit an additional llvm.unreachable to terminate the current // block. 
- rewriter.create(loc); + mlir::LLVM::UnreachableOp::create(rewriter, loc); return mlir::success(); } @@ -3114,15 +3121,15 @@ mlir::LogicalResult CIRToLLVMVecCreateOpLowering::matchAndRewrite( const auto vecTy = mlir::cast(op.getType()); const mlir::Type llvmTy = typeConverter->convertType(vecTy); const mlir::Location loc = op.getLoc(); - mlir::Value result = rewriter.create(loc, llvmTy); + mlir::Value result = mlir::LLVM::PoisonOp::create(rewriter, loc, llvmTy); assert(vecTy.getSize() == op.getElements().size() && "cir.vec.create op count doesn't match vector type elements count"); for (uint64_t i = 0; i < vecTy.getSize(); ++i) { const mlir::Value indexValue = - rewriter.create(loc, rewriter.getI64Type(), i); - result = rewriter.create( - loc, result, adaptor.getElements()[i], indexValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), i); + result = mlir::LLVM::InsertElementOp::create( + rewriter, loc, result, adaptor.getElements()[i], indexValue); } rewriter.replaceOp(op, result); @@ -3151,13 +3158,13 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite( mlir::Type elementType = elementTypeIfVector(op.getLhs().getType()); mlir::Value bitResult; if (auto intType = mlir::dyn_cast(elementType)) { - bitResult = rewriter.create( - op.getLoc(), + bitResult = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), convertCmpKindToICmpPredicate(op.getKind(), intType.isSigned()), adaptor.getLhs(), adaptor.getRhs()); } else if (mlir::isa(elementType)) { - bitResult = rewriter.create( - op.getLoc(), convertCmpKindToFCmpPredicate(op.getKind()), + bitResult = mlir::LLVM::FCmpOp::create( + rewriter, op.getLoc(), convertCmpKindToFCmpPredicate(op.getKind()), adaptor.getLhs(), adaptor.getRhs()); } else { return op.emitError() << "unsupported type for VecCmpOp: " << elementType; @@ -3181,7 +3188,7 @@ mlir::LogicalResult CIRToLLVMVecSplatOpLowering::matchAndRewrite( cir::VectorType vecTy = op.getType(); mlir::Type llvmTy = typeConverter->convertType(vecTy); mlir::Location loc = op.getLoc(); - mlir::Value poison = rewriter.create(loc, llvmTy); + mlir::Value poison = mlir::LLVM::PoisonOp::create(rewriter, loc, llvmTy); mlir::Value elementValue = adaptor.getValue(); if (elementValue.getDefiningOp()) { @@ -3210,9 +3217,9 @@ mlir::LogicalResult CIRToLLVMVecSplatOpLowering::matchAndRewrite( } mlir::Value indexValue = - rewriter.create(loc, rewriter.getI64Type(), 0); - mlir::Value oneElement = rewriter.create( - loc, poison, elementValue, indexValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0); + mlir::Value oneElement = mlir::LLVM::InsertElementOp::create( + rewriter, loc, poison, elementValue, indexValue); SmallVector zeroValues(vecTy.getSize(), 0); rewriter.replaceOpWithNewOp(op, oneElement, poison, zeroValues); @@ -3260,31 +3267,32 @@ mlir::LogicalResult CIRToLLVMVecShuffleDynamicOpLowering::matchAndRewrite( mlir::cast(op.getVec().getType()).getSize(); uint64_t maskBits = llvm::NextPowerOf2(numElements - 1) - 1; - mlir::Value maskValue = rewriter.create( - loc, llvmIndexType, rewriter.getIntegerAttr(llvmIndexType, maskBits)); + mlir::Value maskValue = mlir::LLVM::ConstantOp::create( + rewriter, loc, llvmIndexType, + rewriter.getIntegerAttr(llvmIndexType, maskBits)); mlir::Value maskVector = - rewriter.create(loc, llvmIndexVecType); + mlir::LLVM::UndefOp::create(rewriter, loc, llvmIndexVecType); for (uint64_t i = 0; i < numElements; ++i) { mlir::Value idxValue = - rewriter.create(loc, rewriter.getI64Type(), i); - maskVector = rewriter.create( - 
loc, maskVector, maskValue, idxValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), i); + maskVector = mlir::LLVM::InsertElementOp::create(rewriter, loc, maskVector, + maskValue, idxValue); } - mlir::Value maskedIndices = rewriter.create( - loc, llvmIndexVecType, adaptor.getIndices(), maskVector); - mlir::Value result = rewriter.create( - loc, getTypeConverter()->convertType(op.getVec().getType())); + mlir::Value maskedIndices = mlir::LLVM::AndOp::create( + rewriter, loc, llvmIndexVecType, adaptor.getIndices(), maskVector); + mlir::Value result = mlir::LLVM::UndefOp::create( + rewriter, loc, getTypeConverter()->convertType(op.getVec().getType())); for (uint64_t i = 0; i < numElements; ++i) { mlir::Value iValue = - rewriter.create(loc, rewriter.getI64Type(), i); - mlir::Value indexValue = rewriter.create( - loc, maskedIndices, iValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), i); + mlir::Value indexValue = mlir::LLVM::ExtractElementOp::create( + rewriter, loc, maskedIndices, iValue); mlir::Value valueAtIndex = - rewriter.create(loc, input, indexValue); - result = rewriter.create(loc, result, - valueAtIndex, iValue); + mlir::LLVM::ExtractElementOp::create(rewriter, loc, input, indexValue); + result = mlir::LLVM::InsertElementOp::create(rewriter, loc, result, + valueAtIndex, iValue); } rewriter.replaceOp(op, result); return mlir::success(); @@ -3294,10 +3302,10 @@ mlir::LogicalResult CIRToLLVMVecTernaryOpLowering::matchAndRewrite( cir::VecTernaryOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { // Convert `cond` into a vector of i1, then use that in a `select` op. - mlir::Value bitVec = rewriter.create( - op.getLoc(), mlir::LLVM::ICmpPredicate::ne, adaptor.getCond(), - rewriter.create( - op.getCond().getLoc(), + mlir::Value bitVec = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), mlir::LLVM::ICmpPredicate::ne, adaptor.getCond(), + mlir::LLVM::ZeroOp::create( + rewriter, op.getCond().getLoc(), typeConverter->convertType(op.getCond().getType()))); rewriter.replaceOpWithNewOp( op, bitVec, adaptor.getLhs(), adaptor.getRhs()); @@ -3314,41 +3322,41 @@ mlir::LogicalResult CIRToLLVMComplexAddOpLowering::matchAndRewrite( auto complexType = mlir::cast(op.getLhs().getType()); mlir::Type complexElemTy = getTypeConverter()->convertType(complexType.getElementType()); - auto lhsReal = - rewriter.create(loc, complexElemTy, lhs, 0); - auto lhsImag = - rewriter.create(loc, complexElemTy, lhs, 1); - auto rhsReal = - rewriter.create(loc, complexElemTy, rhs, 0); - auto rhsImag = - rewriter.create(loc, complexElemTy, rhs, 1); + auto lhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{0})); + auto lhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{1})); + auto rhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{0})); + auto rhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{1})); mlir::Value newReal; mlir::Value newImag; if (complexElemTy.isInteger()) { - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::AddOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::AddOp::create(rewriter, loc, complexElemTy, lhsImag, + rhsImag); } else { assert(!cir::MissingFeatures::fastMathFlags()); 
assert(!cir::MissingFeatures::fpConstraints()); - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::FAddOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::FAddOp::create(rewriter, loc, complexElemTy, lhsImag, + rhsImag); } mlir::Type complexLLVMTy = getTypeConverter()->convertType(op.getResult().getType()); auto initialComplex = - rewriter.create(op->getLoc(), complexLLVMTy); + mlir::LLVM::PoisonOp::create(rewriter, op->getLoc(), complexLLVMTy); - auto realComplex = rewriter.create( - op->getLoc(), initialComplex, newReal, 0); + auto realComplex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), initialComplex, newReal, ArrayRef(int64_t{0})); - rewriter.replaceOpWithNewOp(op, realComplex, - newImag, 1); + rewriter.replaceOpWithNewOp( + op, realComplex, newImag, ArrayRef(int64_t{1})); return mlir::success(); } @@ -3359,13 +3367,15 @@ mlir::LogicalResult CIRToLLVMComplexCreateOpLowering::matchAndRewrite( mlir::Type complexLLVMTy = getTypeConverter()->convertType(op.getResult().getType()); auto initialComplex = - rewriter.create(op->getLoc(), complexLLVMTy); + mlir::LLVM::UndefOp::create(rewriter, op->getLoc(), complexLLVMTy); - auto realComplex = rewriter.create( - op->getLoc(), initialComplex, adaptor.getReal(), 0); + auto realComplex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), initialComplex, adaptor.getReal(), + ArrayRef(int64_t{0})); - auto complex = rewriter.create( - op->getLoc(), realComplex, adaptor.getImag(), 1); + auto complex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), realComplex, adaptor.getImag(), + ArrayRef(int64_t{1})); rewriter.replaceOp(op, complex); return mlir::success(); @@ -3395,41 +3405,41 @@ mlir::LogicalResult CIRToLLVMComplexSubOpLowering::matchAndRewrite( auto complexType = mlir::cast(op.getLhs().getType()); mlir::Type complexElemTy = getTypeConverter()->convertType(complexType.getElementType()); - auto lhsReal = - rewriter.create(loc, complexElemTy, lhs, 0); - auto lhsImag = - rewriter.create(loc, complexElemTy, lhs, 1); - auto rhsReal = - rewriter.create(loc, complexElemTy, rhs, 0); - auto rhsImag = - rewriter.create(loc, complexElemTy, rhs, 1); + auto lhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{0})); + auto lhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{1})); + auto rhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{0})); + auto rhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{1})); mlir::Value newReal; mlir::Value newImag; if (complexElemTy.isInteger()) { - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::SubOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::SubOp::create(rewriter, loc, complexElemTy, lhsImag, + rhsImag); } else { assert(!cir::MissingFeatures::fastMathFlags()); assert(!cir::MissingFeatures::fpConstraints()); - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::FSubOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::FSubOp::create(rewriter, loc, complexElemTy, 
lhsImag, + rhsImag); } mlir::Type complexLLVMTy = getTypeConverter()->convertType(op.getResult().getType()); auto initialComplex = - rewriter.create(op->getLoc(), complexLLVMTy); + mlir::LLVM::PoisonOp::create(rewriter, op->getLoc(), complexLLVMTy); - auto realComplex = rewriter.create( - op->getLoc(), initialComplex, newReal, 0); + auto realComplex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), initialComplex, newReal, ArrayRef(int64_t{0})); - rewriter.replaceOpWithNewOp(op, realComplex, - newImag, 1); + rewriter.replaceOpWithNewOp( + op, realComplex, newImag, ArrayRef(int64_t{1})); return mlir::success(); } @@ -3496,8 +3506,8 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( if (storageSize != size) { assert(storageSize > size && "Invalid bitfield size."); - mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), + mlir::Value val = mlir::LLVM::LoadOp::create( + rewriter, op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), op.getIsVolatile()); srcVal = @@ -3510,11 +3520,11 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( ~llvm::APInt::getBitsSet(srcWidth, offset, offset + size)); // Or together the unchanged values and the source value. - srcVal = rewriter.create(op.getLoc(), val, srcVal); + srcVal = mlir::LLVM::OrOp::create(rewriter, op.getLoc(), val, srcVal); } - rewriter.create(op.getLoc(), srcVal, adaptor.getAddr(), - op.getAlignment(), op.getIsVolatile()); + mlir::LLVM::StoreOp::create(rewriter, op.getLoc(), srcVal, adaptor.getAddr(), + op.getAlignment(), op.getIsVolatile()); mlir::Type resultTy = getTypeConverter()->convertType(op.getType()); @@ -3587,10 +3597,10 @@ mlir::LogicalResult CIRToLLVMGetBitfieldOpLowering::matchAndRewrite( mlir::IntegerType intType = computeBitfieldIntType(storageType, context, storageSize); - mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), + mlir::Value val = mlir::LLVM::LoadOp::create( + rewriter, op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), op.getIsVolatile()); - val = rewriter.create(op.getLoc(), intType, val); + val = mlir::LLVM::BitcastOp::create(rewriter, op.getLoc(), intType, val); if (info.getIsSigned()) { assert(static_cast(offset + size) <= storageSize); diff --git a/clang/lib/CIR/Lowering/LoweringHelpers.cpp b/clang/lib/CIR/Lowering/LoweringHelpers.cpp index d5f132431aba2..0786579a601b1 100644 --- a/clang/lib/CIR/Lowering/LoweringHelpers.cpp +++ b/clang/lib/CIR/Lowering/LoweringHelpers.cpp @@ -148,37 +148,37 @@ lowerConstArrayAttr(cir::ConstArrayAttr constArr, mlir::Value getConstAPInt(mlir::OpBuilder &bld, mlir::Location loc, mlir::Type typ, const llvm::APInt &val) { - return bld.create(loc, typ, val); + return mlir::LLVM::ConstantOp::create(bld, loc, typ, val); } mlir::Value getConst(mlir::OpBuilder &bld, mlir::Location loc, mlir::Type typ, unsigned val) { - return bld.create(loc, typ, val); + return mlir::LLVM::ConstantOp::create(bld, loc, typ, val); } mlir::Value createShL(mlir::OpBuilder &bld, mlir::Value lhs, unsigned rhs) { if (!rhs) return lhs; mlir::Value rhsVal = getConst(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return mlir::LLVM::ShlOp::create(bld, lhs.getLoc(), lhs, rhsVal); } mlir::Value createAShR(mlir::OpBuilder &bld, mlir::Value lhs, unsigned rhs) { if (!rhs) return lhs; mlir::Value rhsVal = getConst(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return 
mlir::LLVM::AShrOp::create(bld, lhs.getLoc(), lhs, rhsVal); } mlir::Value createAnd(mlir::OpBuilder &bld, mlir::Value lhs, const llvm::APInt &rhs) { mlir::Value rhsVal = getConstAPInt(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return mlir::LLVM::AndOp::create(bld, lhs.getLoc(), lhs, rhsVal); } mlir::Value createLShR(mlir::OpBuilder &bld, mlir::Value lhs, unsigned rhs) { if (!rhs) return lhs; mlir::Value rhsVal = getConst(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return mlir::LLVM::LShrOp::create(bld, lhs.getLoc(), lhs, rhsVal); } diff --git a/clang/unittests/CIR/PointerLikeTest.cpp b/clang/unittests/CIR/PointerLikeTest.cpp index 22690f2834cc4..b3dfba75f34ce 100644 --- a/clang/unittests/CIR/PointerLikeTest.cpp +++ b/clang/unittests/CIR/PointerLikeTest.cpp @@ -76,7 +76,7 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { EXPECT_EQ(pltTy.getElementType(), ty); OwningOpRef varPtrOp = - b.create(loc, ptrTy, ty, "", getAlignOne(&context)); + cir::AllocaOp::create(b, loc, ptrTy, ty, "", getAlignOne(&context)); mlir::Value val = varPtrOp.get(); mlir::acc::VariableTypeCategory typeCategory = pltTy.getPointeeTypeCategory( @@ -110,7 +110,7 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { // Create an alloca for the array OwningOpRef varPtrOp = - b.create(loc, ptrTy, arrTy, "", getAlignOne(&context)); + cir::AllocaOp::create(b, loc, ptrTy, arrTy, "", getAlignOne(&context)); // Verify that the type category is array. mlir::Value val = varPtrOp.get(); @@ -121,8 +121,8 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { // Create an array-to-pointer decay cast. mlir::Type ptrToElemTy = cir::PointerType::get(ty); - OwningOpRef decayPtr = b.create( - loc, ptrToElemTy, cir::CastKind::array_to_ptrdecay, val); + OwningOpRef decayPtr = cir::CastOp::create( + b, loc, ptrToElemTy, cir::CastKind::array_to_ptrdecay, val); mlir::Value decayVal = decayPtr.get(); // Verify that we still get the expected element type. @@ -141,9 +141,9 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { // Create an element access. mlir::Type i32Ty = cir::IntType::get(&context, 32, true); mlir::Value index = - b.create(loc, cir::IntAttr::get(i32Ty, 2)); + cir::ConstantOp::create(b, loc, cir::IntAttr::get(i32Ty, 2)); OwningOpRef accessPtr = - b.create(loc, ptrToElemTy, decayVal, index); + cir::PtrStrideOp::create(b, loc, ptrToElemTy, decayVal, index); mlir::Value accessVal = accessPtr.get(); // Verify that we still get the expected element type. @@ -175,8 +175,8 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { EXPECT_EQ(pltTy.getElementType(), structTy); // Create an alloca for the array - OwningOpRef varPtrOp = b.create( - loc, ptrTy, structTy, "", getAlignOne(&context)); + OwningOpRef varPtrOp = cir::AllocaOp::create( + b, loc, ptrTy, structTy, "", getAlignOne(&context)); // Verify that the type category is composite. mlir::Value val = varPtrOp.get(); @@ -186,8 +186,8 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { EXPECT_EQ(typeCategory, mlir::acc::VariableTypeCategory::composite); // Access the first element of the structure. - OwningOpRef access1 = b.create( - loc, cir::PointerType::get(ty1), val, b.getStringAttr("f1"), 0); + OwningOpRef access1 = cir::GetMemberOp::create( + b, loc, cir::PointerType::get(ty1), val, "f1", 0u); mlir::Value accessVal1 = access1.get(); // Verify that we get the expected element type. 
@@ -204,8 +204,8 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { EXPECT_EQ(access1TypeCategory, mlir::acc::VariableTypeCategory::composite); // Access the second element of the structure. - OwningOpRef access2 = b.create( - loc, cir::PointerType::get(ty2), val, b.getStringAttr("f2"), 1); + OwningOpRef access2 = cir::GetMemberOp::create( + b, loc, cir::PointerType::get(ty2), val, "f2", 1u); mlir::Value accessVal2 = access2.get(); // Verify that we get the expected element type. @@ -252,17 +252,17 @@ class CIROpenACCPointerLikeTest : public ::testing::Test { mlir::Type structPptrTy = cir::PointerType::get(structTy); // Create an alloca for the struct. - OwningOpRef varPtrOp = b.create( - loc, structPptrTy, structTy, "S", getAlignOne(&context)); + OwningOpRef varPtrOp = cir::AllocaOp::create( + b, loc, structPptrTy, structTy, "S", getAlignOne(&context)); mlir::Value val = varPtrOp.get(); // Get a pointer to the second member. - OwningOpRef access = b.create( - loc, cir::PointerType::get(ptrTy), val, b.getStringAttr("f2"), 1); + OwningOpRef access = cir::GetMemberOp::create( + b, loc, cir::PointerType::get(ptrTy), val, b.getStringAttr("f2"), 1); mlir::Value accessVal = access.get(); // Load the value of the second member. This is the pointer we want to test. - OwningOpRef loadOp = b.create(loc, accessVal); + OwningOpRef loadOp = cir::LoadOp::create(b, loc, accessVal); mlir::Value loadVal = loadOp.get(); // Verify that the type category is the expected type category. From 5d6c00cd0f8f644106b976d65de56f9f4ffa1587 Mon Sep 17 00:00:00 2001 From: Vinay Deshmukh Date: Wed, 22 Oct 2025 14:32:21 -0400 Subject: [PATCH 151/530] [libc] Add -Werror for libc tests (#160413) Relates to https://github.com/llvm/llvm-project/issues/119281 Note: 1) As this PR enables `-Werror` for `libc` tests, it's very likely some downstream CI's may fail / start failing, so it's very likely this PR may need to be reverted and re-applied. P.S. I do not have merge permissions, so I will need one of the reviews to merge it for me. Thank you! --- libc/cmake/modules/LLVMLibCTestRules.cmake | 2 +- libc/test/include/complex_test.cpp | 4 +- .../src/pthread/pthread_barrier_test.cpp | 4 +- .../src/math/exhaustive/bfloat16_add_test.cpp | 5 +- .../src/math/exhaustive/bfloat16_div_test.cpp | 5 +- .../src/math/exhaustive/bfloat16_mul_test.cpp | 5 +- .../src/math/exhaustive/bfloat16_sub_test.cpp | 5 +- libc/test/src/signal/sigaltstack_test.cpp | 2 +- libc/test/src/stdio/fileop_test.cpp | 4 +- libc/test/src/stdio/fopen_test.cpp | 5 +- .../src/sys/socket/linux/send_recv_test.cpp | 10 +-- .../sys/socket/linux/sendmsg_recvmsg_test.cpp | 16 +++-- .../sys/socket/linux/sendto_recvfrom_test.cpp | 4 +- libc/test/src/time/ctime_r_test.cpp | 2 +- libc/test/src/time/mktime_test.cpp | 61 +++++++++++-------- 15 files changed, 77 insertions(+), 57 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 933b81b9f1d46..53bba3e1ef201 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -38,7 +38,7 @@ function(_get_common_test_compile_options output_var c_test flags) list(APPEND compile_options "-Wextra") # -DLIBC_WNO_ERROR=ON if you can't build cleanly with -Werror. 
if(NOT LIBC_WNO_ERROR) - # list(APPEND compile_options "-Werror") + list(APPEND compile_options "-Werror") endif() list(APPEND compile_options "-Wconversion") # FIXME: convert to -Wsign-conversion diff --git a/libc/test/include/complex_test.cpp b/libc/test/include/complex_test.cpp index 6f900aa4fb674..70e8c1c3d9216 100644 --- a/libc/test/include/complex_test.cpp +++ b/libc/test/include/complex_test.cpp @@ -27,8 +27,8 @@ TEST(LlvmLibcComplexTest, CMPLXMacro) { EXPECT_CFP_EQ(CMPLXL(1.0l, 0), 1.0l); #ifdef LIBC_TYPES_HAS_CFLOAT16 - EXPECT_CFP_EQ(CMPLXF16(0, 1.0), I); - EXPECT_CFP_EQ(CMPLXF16(1.0, 0), 1.0); + EXPECT_CFP_EQ(CMPLXF16(0, 1.0), static_cast<_Complex _Float16>(I)); + EXPECT_CFP_EQ(CMPLXF16(1.0, 0), static_cast<_Complex _Float16>(1.0)); #endif // LIBC_TYPES_HAS_CFLOAT16 #ifdef LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/test/integration/src/pthread/pthread_barrier_test.cpp b/libc/test/integration/src/pthread/pthread_barrier_test.cpp index c8e11047e1d80..3fef16522c425 100644 --- a/libc/test/integration/src/pthread/pthread_barrier_test.cpp +++ b/libc/test/integration/src/pthread/pthread_barrier_test.cpp @@ -26,7 +26,7 @@ pthread_barrier_t barrier; LIBC_NAMESPACE::cpp::Atomic counter; -void *increment_counter_and_wait(void *args) { +void *increment_counter_and_wait([[maybe_unused]] void *args) { counter.fetch_add(1); return reinterpret_cast( LIBC_NAMESPACE::pthread_barrier_wait(&barrier)); @@ -102,7 +102,7 @@ void reused_barrier_test() { LIBC_NAMESPACE::pthread_barrier_destroy(&barrier); } -void *barrier_wait(void *in) { +void *barrier_wait([[maybe_unused]] void *in) { return reinterpret_cast( LIBC_NAMESPACE::pthread_barrier_wait(&barrier)); } diff --git a/libc/test/src/math/exhaustive/bfloat16_add_test.cpp b/libc/test/src/math/exhaustive/bfloat16_add_test.cpp index 3f4c77978a93d..20a66e27fa61b 100644 --- a/libc/test/src/math/exhaustive/bfloat16_add_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_add_test.cpp @@ -23,8 +23,9 @@ struct Bfloat16AddChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, - uint16_t y_stop, mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, + [[maybe_unused]] uint16_t y_start, uint16_t y_stop, + mpfr::RoundingMode rounding) { mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/math/exhaustive/bfloat16_div_test.cpp b/libc/test/src/math/exhaustive/bfloat16_div_test.cpp index 2648d5f775af5..43e033108861e 100644 --- a/libc/test/src/math/exhaustive/bfloat16_div_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_div_test.cpp @@ -23,8 +23,9 @@ struct Bfloat16DivChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, - uint16_t y_stop, mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, + [[maybe_unused]] uint16_t y_start, uint16_t y_stop, + mpfr::RoundingMode rounding) { mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp b/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp index 3cbbcb500a4fb..4fd9ccf440a9d 100644 --- a/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp @@ -23,8 +23,9 @@ 
struct Bfloat16MulChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, - uint16_t y_stop, mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, + [[maybe_unused]] uint16_t y_start, uint16_t y_stop, + mpfr::RoundingMode rounding) { mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp b/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp index 11bc6f59dd294..350af52930744 100644 --- a/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp @@ -23,8 +23,9 @@ struct Bfloat16SubChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, - uint16_t y_stop, mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, + [[maybe_unused]] uint16_t y_start, uint16_t y_stop, + mpfr::RoundingMode rounding) { mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp index 8c252c47452df..43c679b23bb29 100644 --- a/libc/test/src/signal/sigaltstack_test.cpp +++ b/libc/test/src/signal/sigaltstack_test.cpp @@ -33,7 +33,7 @@ static void handler(int) { // out or mapped to a register. uint8_t var[LOCAL_VAR_SIZE]; for (int i = 0; i < LOCAL_VAR_SIZE; ++i) - var[i] = i; + var[i] = static_cast(i); // Verify that array is completely on the alt_stack. for (int i = 0; i < LOCAL_VAR_SIZE; ++i) { if (!(uintptr_t(var + i) < uintptr_t(alt_stack + ALT_STACK_SIZE) && diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index 02328042b92b3..1dc9b6522006d 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -101,7 +101,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { // This is not a readable file. ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), - returns(EQ(0)).with_errno(NE(0))); + returns(EQ(static_cast(0))).with_errno(NE(0))); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -175,7 +175,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { // Trying to read more should fetch nothing. 
ASSERT_THAT( LIBC_NAMESPACE::fread(read_data, sizeof(MyStruct), WRITE_NMEMB, file), - returns(EQ(0)).with_errno(EQ(0))); + returns(EQ(static_cast(0))).with_errno(EQ(0))); EXPECT_NE(LIBC_NAMESPACE::feof(file), 0); EXPECT_EQ(LIBC_NAMESPACE::ferror(file), 0); ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); diff --git a/libc/test/src/stdio/fopen_test.cpp b/libc/test/src/stdio/fopen_test.cpp index 3f651f755e7f3..e427e496895e0 100644 --- a/libc/test/src/stdio/fopen_test.cpp +++ b/libc/test/src/stdio/fopen_test.cpp @@ -15,15 +15,14 @@ #include "test/UnitTest/Test.h" TEST(LlvmLibcFOpenTest, PrintToFile) { - int result; FILE *file = LIBC_NAMESPACE::fopen(APPEND_LIBC_TEST("testdata/test.txt"), "w"); ASSERT_FALSE(file == nullptr); static constexpr char STRING[] = "A simple string written to a file\n"; - result = LIBC_NAMESPACE::fwrite(STRING, 1, sizeof(STRING) - 1, file); - EXPECT_GE(result, 0); + size_t result = LIBC_NAMESPACE::fwrite(STRING, 1, sizeof(STRING) - 1, file); + EXPECT_GE(result, static_cast(0)); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/sys/socket/linux/send_recv_test.cpp b/libc/test/src/sys/socket/linux/send_recv_test.cpp index 46f73a29d6f7a..5bfa4fb2ed572 100644 --- a/libc/test/src/sys/socket/linux/send_recv_test.cpp +++ b/libc/test/src/sys/socket/linux/send_recv_test.cpp @@ -50,13 +50,15 @@ TEST_F(LlvmLibcSendRecvTest, SendFails) { const char TEST_MESSAGE[] = "connection terminated"; const size_t MESSAGE_LEN = sizeof(TEST_MESSAGE); - ASSERT_THAT(LIBC_NAMESPACE::send(-1, TEST_MESSAGE, MESSAGE_LEN, 0), - Fails(EBADF)); + ASSERT_THAT( + static_cast(LIBC_NAMESPACE::send(-1, TEST_MESSAGE, MESSAGE_LEN, 0)), + Fails(EBADF)); } TEST_F(LlvmLibcSendRecvTest, RecvFails) { char buffer[256]; - ASSERT_THAT(LIBC_NAMESPACE::recv(-1, buffer, sizeof(buffer), 0), - Fails(EBADF)); + ASSERT_THAT( + static_cast(LIBC_NAMESPACE::recv(-1, buffer, sizeof(buffer), 0)), + Fails(EBADF)); } diff --git a/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp b/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp index 7ed94b20c564a..31099650dd20b 100644 --- a/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp +++ b/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp @@ -45,8 +45,9 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, SucceedsWithSocketPair) { send_message.msg_controllen = 0; send_message.msg_flags = 0; - ASSERT_THAT(LIBC_NAMESPACE::sendmsg(sockpair[0], &send_message, 0), - Succeeds(static_cast(MESSAGE_LEN))); + ASSERT_THAT( + static_cast(LIBC_NAMESPACE::sendmsg(sockpair[0], &send_message, 0)), + Succeeds(static_cast(MESSAGE_LEN))); char buffer[256]; @@ -63,8 +64,9 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, SucceedsWithSocketPair) { recv_message.msg_controllen = 0; recv_message.msg_flags = 0; - ASSERT_THAT(LIBC_NAMESPACE::recvmsg(sockpair[1], &recv_message, 0), - Succeeds(static_cast(MESSAGE_LEN))); + ASSERT_THAT( + static_cast(LIBC_NAMESPACE::recvmsg(sockpair[1], &recv_message, 0)), + Succeeds(static_cast(MESSAGE_LEN))); ASSERT_STREQ(buffer, TEST_MESSAGE); @@ -91,7 +93,8 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, SendFails) { send_message.msg_controllen = 0; send_message.msg_flags = 0; - ASSERT_THAT(LIBC_NAMESPACE::sendmsg(-1, &send_message, 0), Fails(EBADF)); + ASSERT_THAT(static_cast(LIBC_NAMESPACE::sendmsg(-1, &send_message, 0)), + Fails(EBADF)); } TEST_F(LlvmLibcSendMsgRecvMsgTest, RecvFails) { @@ -110,5 +113,6 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, RecvFails) { recv_message.msg_controllen = 0; recv_message.msg_flags = 0; - ASSERT_THAT(LIBC_NAMESPACE::recvmsg(-1, &recv_message, 0), 
Fails(EBADF)); + ASSERT_THAT(static_cast(LIBC_NAMESPACE::recvmsg(-1, &recv_message, 0)), + Fails(EBADF)); } diff --git a/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp b/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp index 8377260252c37..c3f84266d464c 100644 --- a/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp +++ b/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp @@ -54,7 +54,7 @@ TEST_F(LlvmLibcSendToRecvFromTest, SendToFails) { ASSERT_THAT( LIBC_NAMESPACE::sendto(-1, TEST_MESSAGE, MESSAGE_LEN, 0, nullptr, 0), - Fails(EBADF)); + Fails(static_cast(EBADF), static_cast(-1))); } TEST_F(LlvmLibcSendToRecvFromTest, RecvFromFails) { @@ -62,5 +62,5 @@ TEST_F(LlvmLibcSendToRecvFromTest, RecvFromFails) { ASSERT_THAT( LIBC_NAMESPACE::recvfrom(-1, buffer, sizeof(buffer), 0, nullptr, 0), - Fails(EBADF)); + Fails(static_cast(EBADF), static_cast(-1))); } diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp index ee06c706734fb..084db300bf2d6 100644 --- a/libc/test/src/time/ctime_r_test.cpp +++ b/libc/test/src/time/ctime_r_test.cpp @@ -23,7 +23,7 @@ TEST_F(LlvmLibcCtimeR, Nullptr) { result = LIBC_NAMESPACE::ctime_r(nullptr, buffer); ASSERT_STREQ(nullptr, result); - time_t t; + time_t t = 2147483648; // invalid argument result = LIBC_NAMESPACE::ctime_r(&t, nullptr); ASSERT_STREQ(nullptr, result); } diff --git a/libc/test/src/time/mktime_test.cpp b/libc/test/src/time/mktime_test.cpp index 1dfdd73de5440..1062fa45ef644 100644 --- a/libc/test/src/time/mktime_test.cpp +++ b/libc/test/src/time/mktime_test.cpp @@ -36,7 +36,8 @@ TEST(LlvmLibcMkTime, FailureSetsErrno) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Fails(EOVERFLOW)); } TEST(LlvmLibcMkTime, InvalidSeconds) { @@ -51,7 +52,8 @@ TEST(LlvmLibcMkTime, InvalidSeconds) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-1)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Succeeds(-1)); EXPECT_TM_EQ((tm{.tm_sec = 59, .tm_min = 59, .tm_hour = 23, @@ -75,7 +77,8 @@ TEST(LlvmLibcMkTime, InvalidSeconds) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(60)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Succeeds(60)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 1, .tm_hour = 0, @@ -101,7 +104,7 @@ TEST(LlvmLibcMkTime, InvalidMinutes) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 59, @@ -126,7 +129,7 @@ TEST(LlvmLibcMkTime, InvalidMinutes) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(60 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -153,7 +156,7 @@ TEST(LlvmLibcMkTime, InvalidHours) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -179,7 +182,7 @@ TEST(LlvmLibcMkTime, InvalidHours) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - 
LIBC_NAMESPACE::mktime(&tm_data), + static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(24 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -205,7 +208,7 @@ TEST(LlvmLibcMkTime, InvalidYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(-LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, @@ -234,7 +237,8 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Fails(EOVERFLOW)); } { @@ -248,7 +252,8 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Fails(EOVERFLOW)); } { @@ -262,7 +267,8 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Fails(EOVERFLOW)); } { @@ -276,7 +282,8 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Fails(EOVERFLOW)); } { @@ -290,7 +297,8 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Fails(EOVERFLOW)); } { @@ -304,7 +312,8 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Fails(EOVERFLOW)); } } @@ -321,7 +330,7 @@ TEST(LlvmLibcMkTime, InvalidMonths) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - LIBC_NAMESPACE::mktime(&tm_data), + static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(-32 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -347,7 +356,7 @@ TEST(LlvmLibcMkTime, InvalidMonths) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - LIBC_NAMESPACE::mktime(&tm_data), + static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, @@ -375,7 +384,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(-1 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -400,7 +409,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -425,7 +434,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - 
EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(59 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -451,7 +460,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - LIBC_NAMESPACE::mktime(&tm_data), + static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(((2 * LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR) + 60) * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); @@ -481,7 +490,8 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Succeeds(0x7FFFFFFF)); EXPECT_TM_EQ((tm{.tm_sec = 7, .tm_min = 14, .tm_hour = 3, @@ -507,7 +517,8 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF - 8)); + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + Succeeds(0x7FFFFFFF - 8)); EXPECT_TM_EQ((tm{.tm_sec = 59, .tm_min = 13, .tm_hour = 3, @@ -532,7 +543,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(0x7FFFFFFF - 8 - 14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN)); EXPECT_TM_EQ((tm{.tm_sec = 59, @@ -559,7 +570,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(0x7FFFFFFF - 8 - 14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN - 3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR)); @@ -587,7 +598,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), + EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), Succeeds(0x7FFFFFFF - 8 - 14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN - 3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR - From 25a915a91cf4a5e5fd34e6613b0d329e32451a4f Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 22 Oct 2025 11:33:58 -0700 Subject: [PATCH 152/530] [ThinLTO] Add HasLocal flag to GlobalValueSummaryInfo (#164647) Add a flag to the GlobalValueSummaryInfo indicating whether the associated SummaryList (all summaries with the same GUID) contains any summaries with local linkage. This flag is set when building the index, so it is associated with the original linkage type before internalization and promotion. Consumers should check the withInternalizeAndPromote() flag on the index before using it. In most cases we expect a 1-1 mapping between a GUID and a summary with local linkage, because for locals the GUID is computed from the hash of "modulepath;name". However, there can be multiple locals with the same GUID if translation units are not compiled with enough path. And in rare but theoretically possible cases, there can be hash collisions on the underlying MD5 computation. So to be safe when looking for local summaries, analyses currently look through all summaries in the list. These lists can be extremely long in the case of large binaries with template function defs in widely used headers (i.e. linkonce_odr). 
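As an illustration (a sketch of the existing GUID scheme described above, not
something this patch changes), a local symbol's summary GUID is effectively

  // hedged sketch; MD5Hash and GlobalValue::GUID are the existing LLVM names,
  // the path and mangled name below are made up for the example
  GlobalValue::GUID G = MD5Hash("subdir/foo.cpp;_ZL3barv"); // "modulepath;name"

so two translation units compiled without a sufficiently distinguishing module
path (for example, both built as a bare "foo.cpp") can produce the same
identifier string, and therefore the same GUID, for distinct locals.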
A follow on change will use this flag to reduce ThinLTO analysis time in WPD by 5-6% for a large target (details in PR164046 which will be reworked to use this flag). Note that in the past we have tried to keep bits related to the GUID in the ValueInfo (which has a pointer to the associated GlobalValueSummaryInfo), via its PointerIntPair. However, we are out of bits there. This change does add a byte to every GlobalValueSummaryInfo instance, which I measured as a little under 0.90% overhead in a large target. However, it enables adding 7 bits of other per-GUID flags in the future without adding more overhead. Note that it was lower overhead to add this to the GlobalValueSummaryInfo than the ValueInfo, which tends to be copied into other maps. --- llvm/include/llvm/IR/ModuleSummaryIndex.h | 44 ++++++++++++++++++++--- llvm/lib/LTO/LTO.cpp | 4 +++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 98df06ae43ad0..cdfee727387a5 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -172,9 +172,11 @@ struct alignas(8) GlobalValueSummaryInfo { /// Add a summary corresponding to a global value definition in a module with /// the corresponding GUID. - void addSummary(std::unique_ptr Summary) { - return SummaryList.push_back(std::move(Summary)); - } + inline void addSummary(std::unique_ptr Summary); + + /// Verify that the HasLocal flag is consistent with the SummaryList. Should + /// only be called prior to index-based internalization and promotion. + inline void verifyLocal() const; private: /// List of global value summary structures for a particular value held @@ -183,6 +185,22 @@ struct alignas(8) GlobalValueSummaryInfo { /// compiling without sufficient distinguishing path, or (theoretically) hash /// collisions. Each summary is from a different module. GlobalValueSummaryList SummaryList; + + /// True if the SummaryList contains at least one summary with local linkage. + /// In most cases there should be only one, unless translation units with + /// same-named locals were compiled without distinguishing path. And generally + /// there should not be a mix of local and non-local summaries, because the + /// GUID for a local is computed with the path prepended and a ';' delimiter. + /// In extremely rare cases there could be a GUID hash collision. Having the + /// flag saves having to walk through all summaries to prove the existence or + /// not of any locals. + /// NOTE: this flag is set when the index is built. It does not reflect + /// index-based internalization and promotion decisions. Generally most + /// index-based analysis occurs before then, but any users should assert that + /// the withInternalizeAndPromote() flag is not set on the index. + /// TODO: Replace checks in various ThinLTO analyses that loop through all + /// summaries to handle the local case with a check of the flag. + bool HasLocal : 1; }; /// Map from global value GUID to corresponding summary structures. Use a @@ -219,6 +237,8 @@ struct ValueInfo { return getRef()->second.getSummaryList(); } + void verifyLocal() const { getRef()->second.verifyLocal(); } + // Even if the index is built with GVs available, we may not have one for // summary entries synthesized for profiled indirect call targets. 
bool hasName() const { return !haveGVs() || getValue(); } @@ -649,7 +669,23 @@ class GlobalValueSummary { friend class ModuleSummaryIndex; }; -GlobalValueSummaryInfo::GlobalValueSummaryInfo(bool HaveGVs) : U(HaveGVs) {} +GlobalValueSummaryInfo::GlobalValueSummaryInfo(bool HaveGVs) + : U(HaveGVs), HasLocal(false) {} + +void GlobalValueSummaryInfo::addSummary( + std::unique_ptr Summary) { + if (GlobalValue::isLocalLinkage(Summary->linkage())) + HasLocal = true; + return SummaryList.push_back(std::move(Summary)); +} + +void GlobalValueSummaryInfo::verifyLocal() const { + assert(HasLocal == + llvm::any_of(SummaryList, + [](const std::unique_ptr &Summary) { + return GlobalValue::isLocalLinkage(Summary->linkage()); + })); +} /// Alias summary information. class AliasSummary : public GlobalValueSummary { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 72ae064e46a0c..86780e19f22b8 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -477,6 +477,10 @@ static void thinLTOInternalizeAndPromoteGUID( return !GlobalValue::isLocalLinkage(Summary->linkage()); }); + // Before performing index-based internalization and promotion for this GUID, + // the local flag should be consistent with the summary list linkage types. + VI.verifyLocal(); + for (auto &S : VI.getSummaryList()) { // First see if we need to promote an internal value because it is not // exported. From 10113c4ab425167d3a18ba0425466cab118902d7 Mon Sep 17 00:00:00 2001 From: Walter Lee <49250218+googlewalt@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:37:30 -0400 Subject: [PATCH 153/530] [bazel][libc] Add missing dependency (#164668) Fixes #164522. --- .../libc/test/src/math/libc_math_test_rules.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl index d2297d51383cc..4b8a34646d637 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl @@ -35,6 +35,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs): "//libc:__support_fputil_manipulation_functions", "//libc:__support_fputil_nearest_integer_operations", "//libc:__support_fputil_normal_float", + "//libc:__support_macros_optimization", "//libc:__support_macros_properties_architectures", "//libc:__support_macros_properties_os", "//libc:__support_macros_properties_types", From f8f7f1b67c8ee5d81847955dc36fab86a6d129ad Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 22 Oct 2025 11:50:46 -0700 Subject: [PATCH 154/530] [lit] Update shtest-ulimit.py to explicitly check default limits (#164678) The recently added ulimit_reset.txt section in shtest-ulimit.py was failing on some builders if the default file descriptor limit started with 50. This patch fixes that by explicitly checking that the file descriptor limit is equal to the default value. --- llvm/utils/lit/tests/shtest-ulimit.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py index 9a8febd3333c4..dadde70336812 100644 --- a/llvm/utils/lit/tests/shtest-ulimit.py +++ b/llvm/utils/lit/tests/shtest-ulimit.py @@ -5,7 +5,11 @@ # as well. 
# UNSUPPORTED: system-windows, system-solaris -# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit --order=lexical | FileCheck %s +# RUN: %{python} %S/Inputs/shtest-ulimit/print_limits.py | grep RLIMIT_NOFILE \ +# RUN: | sed -n -e 's/.*=//p' | tr -d '\n' > %t.nofile_limit + +# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit --order=lexical \ +# RUN: | FileCheck -DBASE_NOFILE_LIMIT=%{readfile:%t.nofile_limit} %s # CHECK: -- Testing: 3 tests{{.*}} @@ -18,4 +22,4 @@ # CHECK: RLIMIT_NOFILE=50 # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_reset.txt ({{[^)]*}}) -# CHECK-NOT: RLIMIT_NOFILE=50 +# CHECK: RLIMIT_NOFILE=[[BASE_NOFILE_LIMIT]] From 9a97717df0647c0073bade6504e8c8c155ef5186 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 22 Oct 2025 14:50:48 -0400 Subject: [PATCH 155/530] [gn] port be9c083cf7ec (CAS on-disk changes) --- .../gn/secondary/llvm/include/llvm/Config/BUILD.gn | 7 ++++++- .../secondary/llvm/include/llvm/Config/config.gni | 3 +++ llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn | 2 ++ .../utils/gn/secondary/llvm/unittests/CAS/BUILD.gn | 14 +++++++++++--- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 022cd87a5b303..1ca929219a953 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -301,7 +301,6 @@ write_cmake_config("llvm-config") { "LLVM_BUILD_SHARED_LIBS=", "LLVM_ENABLE_LLVM_C_EXPORT_ANNOTATIONS=", "LLVM_ENABLE_TELEMETRY=", - "LLVM_ENABLE_ONDISK_CAS=", "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple", "LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE=", "LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN=", @@ -367,6 +366,12 @@ write_cmake_config("llvm-config") { values += [ "LLVM_ENABLE_DIA_SDK=" ] } + if (llvm_enable_ondisk_cas) { + values += [ "LLVM_ENABLE_ONDISK_CAS=1" ] + } else { + values += [ "LLVM_ENABLE_ONDISK_CAS=" ] + } + if (llvm_enable_threads) { values += [ "LLVM_ENABLE_THREADS=1" ] } else { diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/config.gni b/llvm/utils/gn/secondary/llvm/include/llvm/Config/config.gni index 8c2ab8af8ed1f..715b03e91c756 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/config.gni +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/config.gni @@ -1,4 +1,7 @@ declare_args() { # Iterate unordered llvm containers in reverse. llvm_enable_reverse_iteration = false + + # Iterate unordered llvm containers in reverse. 
+ llvm_enable_ondisk_cas = false } diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn index b4edd8d65ecb0..5590b27ac3784 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn @@ -10,6 +10,8 @@ static_library("CAS") { "ObjectStore.cpp", "OnDiskCommon.cpp", "OnDiskDataAllocator.cpp", + "OnDiskGraphDB.cpp", + "OnDiskKeyValueDB.cpp", "OnDiskTrieRawHashMap.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn index 52a64beb9cac2..2d9eb6814c376 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn @@ -1,3 +1,4 @@ +import("//llvm/include/llvm/Config/config.gni") import("//third-party/unittest/unittest.gni") unittest("CASTests") { @@ -10,8 +11,15 @@ unittest("CASTests") { "ActionCacheTest.cpp", "CASTestConfig.cpp", "ObjectStoreTest.cpp", - "OnDiskDataAllocatorTest.cpp", - "OnDiskTrieRawHashMapTest.cpp", - "ProgramTest.cpp", ] + + if (llvm_enable_ondisk_cas) { + sources += [ + "OnDiskDataAllocatorTest.cpp", + "OnDiskGraphDBTest.cpp", + "OnDiskKeyValueDBTest.cpp", + "OnDiskTrieRawHashMapTest.cpp", + "ProgramTest.cpp", + ] + } } From f607b2a6ea36d61e7526b44826b6a61c172f6c4d Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Wed, 22 Oct 2025 11:54:18 -0700 Subject: [PATCH 156/530] [clang][Sema][NFC] Remove unused delayed template parsing cleanup callback (#164149) The last use of it was removed in 6163aa9. --- clang/include/clang/Sema/Sema.h | 6 +----- clang/lib/Parse/Parser.cpp | 2 +- clang/lib/Sema/Sema.cpp | 10 +++------- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 87b96c2d5ad09..189798f71dbad 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1321,15 +1321,11 @@ class Sema final : public SemaBase { /// Callback to the parser to parse templated functions when needed. typedef void LateTemplateParserCB(void *P, LateParsedTemplate &LPT); - typedef void LateTemplateParserCleanupCB(void *P); LateTemplateParserCB *LateTemplateParser; - LateTemplateParserCleanupCB *LateTemplateParserCleanup; void *OpaqueParser; - void SetLateTemplateParser(LateTemplateParserCB *LTP, - LateTemplateParserCleanupCB *LTPCleanup, void *P) { + void SetLateTemplateParser(LateTemplateParserCB *LTP, void *P) { LateTemplateParser = LTP; - LateTemplateParserCleanup = LTPCleanup; OpaqueParser = P; } diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index ec01faf446e8d..a6fc676f23a51 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -708,7 +708,7 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, } // Late template parsing can begin. - Actions.SetLateTemplateParser(LateTemplateParserCallback, nullptr, this); + Actions.SetLateTemplateParser(LateTemplateParserCallback, this); Actions.ActOnEndOfTranslationUnit(); //else don't tell Sema that we ended parsing: more input might come. 
return true; diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 8ed3df7cf17a5..23bf7f217a01a 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -276,10 +276,9 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, Context(ctxt), Consumer(consumer), Diags(PP.getDiagnostics()), SourceMgr(PP.getSourceManager()), APINotes(SourceMgr, LangOpts), AnalysisWarnings(*this), ThreadSafetyDeclCache(nullptr), - LateTemplateParser(nullptr), LateTemplateParserCleanup(nullptr), - OpaqueParser(nullptr), CurContext(nullptr), ExternalSource(nullptr), - StackHandler(Diags), CurScope(nullptr), Ident_super(nullptr), - AMDGPUPtr(std::make_unique(*this)), + LateTemplateParser(nullptr), OpaqueParser(nullptr), CurContext(nullptr), + ExternalSource(nullptr), StackHandler(Diags), CurScope(nullptr), + Ident_super(nullptr), AMDGPUPtr(std::make_unique(*this)), ARMPtr(std::make_unique(*this)), AVRPtr(std::make_unique(*this)), BPFPtr(std::make_unique(*this)), @@ -1248,9 +1247,6 @@ void Sema::ActOnEndOfTranslationUnit() { ? TUFragmentKind::Private : TUFragmentKind::Normal); - if (LateTemplateParserCleanup) - LateTemplateParserCleanup(OpaqueParser); - CheckDelayedMemberExceptionSpecs(); } else { // If we are building a TU prefix for serialization, it is safe to transfer From c0b42ec05344707d94805ec795a7bc8d33a09594 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 22 Oct 2025 14:58:10 -0400 Subject: [PATCH 157/530] [mlir][IR] Deprecate `OpBuilder::create` in favor of free functions (#164649) These have been soft-deprecated since July: https://discourse.llvm.org/t/psa-opty-create-now-with-100-more-tab-complete/87339 Add a deprecation attribute to prevent new uses from creeping in. --- mlir/include/mlir/IR/Builders.h | 5 ++-- .../OpenMPOffloadPrivatizationPrepare.cpp | 24 +++++++++---------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h index 9205f16f97bbb..3ba6818204ba0 100644 --- a/mlir/include/mlir/IR/Builders.h +++ b/mlir/include/mlir/IR/Builders.h @@ -502,6 +502,7 @@ class OpBuilder : public Builder { public: /// Create an operation of specific op type at the current insertion point. template + [[deprecated("Use OpTy::create instead")]] OpTy create(Location location, Args &&...args) { OperationState state(location, getCheckRegisteredInfo(location.getContext())); @@ -517,9 +518,9 @@ class OpBuilder : public Builder { /// the results of the operation. /// /// Note: This performs opportunistic eager folding during IR construction. - /// The folders are designed to operate efficiently on canonical IR, which + /// The folders are designed to operate efficiently on canonical IR, which /// this API does not enforce. Complete folding isn't only expected in the - /// context of canonicalization which intertwine folders with pattern + /// context of canonicalization which intertwine folders with pattern /// rewrites until fixed-point. 
template void createOrFold(SmallVectorImpl &results, Location location, diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp index db54eaa2628a8..735b905bffb85 100644 --- a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp +++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp @@ -205,14 +205,14 @@ class PrepareForOMPOffloadPrivatizationPass assert(!region.empty() && "region cannot be empty"); LLVM::LLVMFuncOp func = createFuncOpForRegion( loc, mod, region, funcName, rewriter, returnsValue); - auto call = rewriter.create(loc, func, args); + auto call = LLVM::CallOp::create(rewriter, loc, func, args); return call.getResult(); }; Value moldArg, newArg; if (isPrivatizedByValue) { - moldArg = rewriter.create(loc, varType, varPtr); - newArg = rewriter.create(loc, varType, heapMem); + moldArg = LLVM::LoadOp::create(rewriter, loc, varType, varPtr); + newArg = LLVM::LoadOp::create(rewriter, loc, varType, heapMem); } else { moldArg = varPtr; newArg = heapMem; @@ -234,7 +234,7 @@ class PrepareForOMPOffloadPrivatizationPass {moldArg, initializedVal}, /*returnsValue=*/true); if (isPrivatizedByValue) - (void)rewriter.create(loc, initializedVal, heapMem); + (void)LLVM::StoreOp::create(rewriter, loc, initializedVal, heapMem); // clone origOp, replace all uses of varPtr with heapMem and // erase origOp. @@ -275,8 +275,8 @@ class PrepareForOMPOffloadPrivatizationPass // targetOp. if (isPrivatizedByValue) { rewriter.setInsertionPoint(targetOp); - auto newPrivVar = rewriter.create(mapInfoOp.getLoc(), - varType, heapMem); + auto newPrivVar = LLVM::LoadOp::create(rewriter, mapInfoOp.getLoc(), + varType, heapMem); newPrivVars.push_back(newPrivVar); } @@ -294,7 +294,7 @@ class PrepareForOMPOffloadPrivatizationPass cleanupTaskOp = omp::TaskOp::create(rewriter, loc, taskOperands); Block *taskBlock = rewriter.createBlock(&cleanupTaskOp.getRegion()); rewriter.setInsertionPointToEnd(taskBlock); - rewriter.create(cleanupTaskOp.getLoc()); + omp::TerminatorOp::create(rewriter, cleanupTaskOp.getLoc()); } rewriter.setInsertionPointToStart( &*cleanupTaskOp.getRegion().getBlocks().begin()); @@ -307,8 +307,8 @@ class PrepareForOMPOffloadPrivatizationPass LLVM::lookupOrCreateFreeFn(rewriter, mod); assert(llvm::succeeded(freeFunc) && "Could not find free in the module"); - (void)rewriter.create(loc, freeFunc.value(), - ValueRange{heapMem}); + (void)LLVM::CallOp::create(rewriter, loc, freeFunc.value(), + ValueRange{heapMem}); } } assert(newPrivVars.size() == privateVars.size() && @@ -390,11 +390,11 @@ class PrepareForOMPOffloadPrivatizationPass const DataLayout &dl = DataLayout(mod); std::int64_t distance = getSizeInBytes(dl, varType); - Value sizeBytes = rewriter.create( - loc, mallocFn.getFunctionType().getParamType(0), distance); + Value sizeBytes = LLVM::ConstantOp::create( + rewriter, loc, mallocFn.getFunctionType().getParamType(0), distance); auto mallocCallOp = - rewriter.create(loc, mallocFn, ValueRange{sizeBytes}); + LLVM::CallOp::create(rewriter, loc, mallocFn, ValueRange{sizeBytes}); return mallocCallOp.getResult(); } From c597a4f476655cfa5ab2983964e10eb289554547 Mon Sep 17 00:00:00 2001 From: lntue Date: Wed, 22 Oct 2025 15:05:10 -0400 Subject: [PATCH 158/530] Revert "[libc] Add -Werror for libc tests" (#164684) Reverts llvm/llvm-project#160413 --- libc/cmake/modules/LLVMLibCTestRules.cmake | 2 +- libc/test/include/complex_test.cpp | 4 +- 
.../src/pthread/pthread_barrier_test.cpp | 4 +- .../src/math/exhaustive/bfloat16_add_test.cpp | 5 +- .../src/math/exhaustive/bfloat16_div_test.cpp | 5 +- .../src/math/exhaustive/bfloat16_mul_test.cpp | 5 +- .../src/math/exhaustive/bfloat16_sub_test.cpp | 5 +- libc/test/src/signal/sigaltstack_test.cpp | 2 +- libc/test/src/stdio/fileop_test.cpp | 4 +- libc/test/src/stdio/fopen_test.cpp | 5 +- .../src/sys/socket/linux/send_recv_test.cpp | 10 ++- .../sys/socket/linux/sendmsg_recvmsg_test.cpp | 16 ++--- .../sys/socket/linux/sendto_recvfrom_test.cpp | 4 +- libc/test/src/time/ctime_r_test.cpp | 2 +- libc/test/src/time/mktime_test.cpp | 61 ++++++++----------- 15 files changed, 57 insertions(+), 77 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 53bba3e1ef201..933b81b9f1d46 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -38,7 +38,7 @@ function(_get_common_test_compile_options output_var c_test flags) list(APPEND compile_options "-Wextra") # -DLIBC_WNO_ERROR=ON if you can't build cleanly with -Werror. if(NOT LIBC_WNO_ERROR) - list(APPEND compile_options "-Werror") + # list(APPEND compile_options "-Werror") endif() list(APPEND compile_options "-Wconversion") # FIXME: convert to -Wsign-conversion diff --git a/libc/test/include/complex_test.cpp b/libc/test/include/complex_test.cpp index 70e8c1c3d9216..6f900aa4fb674 100644 --- a/libc/test/include/complex_test.cpp +++ b/libc/test/include/complex_test.cpp @@ -27,8 +27,8 @@ TEST(LlvmLibcComplexTest, CMPLXMacro) { EXPECT_CFP_EQ(CMPLXL(1.0l, 0), 1.0l); #ifdef LIBC_TYPES_HAS_CFLOAT16 - EXPECT_CFP_EQ(CMPLXF16(0, 1.0), static_cast<_Complex _Float16>(I)); - EXPECT_CFP_EQ(CMPLXF16(1.0, 0), static_cast<_Complex _Float16>(1.0)); + EXPECT_CFP_EQ(CMPLXF16(0, 1.0), I); + EXPECT_CFP_EQ(CMPLXF16(1.0, 0), 1.0); #endif // LIBC_TYPES_HAS_CFLOAT16 #ifdef LIBC_TYPES_HAS_CFLOAT128 diff --git a/libc/test/integration/src/pthread/pthread_barrier_test.cpp b/libc/test/integration/src/pthread/pthread_barrier_test.cpp index 3fef16522c425..c8e11047e1d80 100644 --- a/libc/test/integration/src/pthread/pthread_barrier_test.cpp +++ b/libc/test/integration/src/pthread/pthread_barrier_test.cpp @@ -26,7 +26,7 @@ pthread_barrier_t barrier; LIBC_NAMESPACE::cpp::Atomic counter; -void *increment_counter_and_wait([[maybe_unused]] void *args) { +void *increment_counter_and_wait(void *args) { counter.fetch_add(1); return reinterpret_cast( LIBC_NAMESPACE::pthread_barrier_wait(&barrier)); @@ -102,7 +102,7 @@ void reused_barrier_test() { LIBC_NAMESPACE::pthread_barrier_destroy(&barrier); } -void *barrier_wait([[maybe_unused]] void *in) { +void *barrier_wait(void *in) { return reinterpret_cast( LIBC_NAMESPACE::pthread_barrier_wait(&barrier)); } diff --git a/libc/test/src/math/exhaustive/bfloat16_add_test.cpp b/libc/test/src/math/exhaustive/bfloat16_add_test.cpp index 20a66e27fa61b..3f4c77978a93d 100644 --- a/libc/test/src/math/exhaustive/bfloat16_add_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_add_test.cpp @@ -23,9 +23,8 @@ struct Bfloat16AddChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, - [[maybe_unused]] uint16_t y_start, uint16_t y_stop, - mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, + uint16_t y_stop, mpfr::RoundingMode rounding) { 
mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/math/exhaustive/bfloat16_div_test.cpp b/libc/test/src/math/exhaustive/bfloat16_div_test.cpp index 43e033108861e..2648d5f775af5 100644 --- a/libc/test/src/math/exhaustive/bfloat16_div_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_div_test.cpp @@ -23,9 +23,8 @@ struct Bfloat16DivChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, - [[maybe_unused]] uint16_t y_start, uint16_t y_stop, - mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, + uint16_t y_stop, mpfr::RoundingMode rounding) { mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp b/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp index 4fd9ccf440a9d..3cbbcb500a4fb 100644 --- a/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_mul_test.cpp @@ -23,9 +23,8 @@ struct Bfloat16MulChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, - [[maybe_unused]] uint16_t y_start, uint16_t y_stop, - mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, + uint16_t y_stop, mpfr::RoundingMode rounding) { mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp b/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp index 350af52930744..11bc6f59dd294 100644 --- a/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp +++ b/libc/test/src/math/exhaustive/bfloat16_sub_test.cpp @@ -23,9 +23,8 @@ struct Bfloat16SubChecker : public virtual LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; - uint64_t check(uint16_t x_start, uint16_t x_stop, - [[maybe_unused]] uint16_t y_start, uint16_t y_stop, - mpfr::RoundingMode rounding) { + uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, + uint16_t y_stop, mpfr::RoundingMode rounding) { mpfr::ForceRoundingMode r(rounding); if (!r.success) return true; diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp index 43c679b23bb29..8c252c47452df 100644 --- a/libc/test/src/signal/sigaltstack_test.cpp +++ b/libc/test/src/signal/sigaltstack_test.cpp @@ -33,7 +33,7 @@ static void handler(int) { // out or mapped to a register. uint8_t var[LOCAL_VAR_SIZE]; for (int i = 0; i < LOCAL_VAR_SIZE; ++i) - var[i] = static_cast(i); + var[i] = i; // Verify that array is completely on the alt_stack. for (int i = 0; i < LOCAL_VAR_SIZE; ++i) { if (!(uintptr_t(var + i) < uintptr_t(alt_stack + ALT_STACK_SIZE) && diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index 1dc9b6522006d..02328042b92b3 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -101,7 +101,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { // This is not a readable file. 
ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), - returns(EQ(static_cast(0))).with_errno(NE(0))); + returns(EQ(0)).with_errno(NE(0))); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -175,7 +175,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { // Trying to read more should fetch nothing. ASSERT_THAT( LIBC_NAMESPACE::fread(read_data, sizeof(MyStruct), WRITE_NMEMB, file), - returns(EQ(static_cast(0))).with_errno(EQ(0))); + returns(EQ(0)).with_errno(EQ(0))); EXPECT_NE(LIBC_NAMESPACE::feof(file), 0); EXPECT_EQ(LIBC_NAMESPACE::ferror(file), 0); ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); diff --git a/libc/test/src/stdio/fopen_test.cpp b/libc/test/src/stdio/fopen_test.cpp index e427e496895e0..3f651f755e7f3 100644 --- a/libc/test/src/stdio/fopen_test.cpp +++ b/libc/test/src/stdio/fopen_test.cpp @@ -15,14 +15,15 @@ #include "test/UnitTest/Test.h" TEST(LlvmLibcFOpenTest, PrintToFile) { + int result; FILE *file = LIBC_NAMESPACE::fopen(APPEND_LIBC_TEST("testdata/test.txt"), "w"); ASSERT_FALSE(file == nullptr); static constexpr char STRING[] = "A simple string written to a file\n"; - size_t result = LIBC_NAMESPACE::fwrite(STRING, 1, sizeof(STRING) - 1, file); - EXPECT_GE(result, static_cast(0)); + result = LIBC_NAMESPACE::fwrite(STRING, 1, sizeof(STRING) - 1, file); + EXPECT_GE(result, 0); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/sys/socket/linux/send_recv_test.cpp b/libc/test/src/sys/socket/linux/send_recv_test.cpp index 5bfa4fb2ed572..46f73a29d6f7a 100644 --- a/libc/test/src/sys/socket/linux/send_recv_test.cpp +++ b/libc/test/src/sys/socket/linux/send_recv_test.cpp @@ -50,15 +50,13 @@ TEST_F(LlvmLibcSendRecvTest, SendFails) { const char TEST_MESSAGE[] = "connection terminated"; const size_t MESSAGE_LEN = sizeof(TEST_MESSAGE); - ASSERT_THAT( - static_cast(LIBC_NAMESPACE::send(-1, TEST_MESSAGE, MESSAGE_LEN, 0)), - Fails(EBADF)); + ASSERT_THAT(LIBC_NAMESPACE::send(-1, TEST_MESSAGE, MESSAGE_LEN, 0), + Fails(EBADF)); } TEST_F(LlvmLibcSendRecvTest, RecvFails) { char buffer[256]; - ASSERT_THAT( - static_cast(LIBC_NAMESPACE::recv(-1, buffer, sizeof(buffer), 0)), - Fails(EBADF)); + ASSERT_THAT(LIBC_NAMESPACE::recv(-1, buffer, sizeof(buffer), 0), + Fails(EBADF)); } diff --git a/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp b/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp index 31099650dd20b..7ed94b20c564a 100644 --- a/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp +++ b/libc/test/src/sys/socket/linux/sendmsg_recvmsg_test.cpp @@ -45,9 +45,8 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, SucceedsWithSocketPair) { send_message.msg_controllen = 0; send_message.msg_flags = 0; - ASSERT_THAT( - static_cast(LIBC_NAMESPACE::sendmsg(sockpair[0], &send_message, 0)), - Succeeds(static_cast(MESSAGE_LEN))); + ASSERT_THAT(LIBC_NAMESPACE::sendmsg(sockpair[0], &send_message, 0), + Succeeds(static_cast(MESSAGE_LEN))); char buffer[256]; @@ -64,9 +63,8 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, SucceedsWithSocketPair) { recv_message.msg_controllen = 0; recv_message.msg_flags = 0; - ASSERT_THAT( - static_cast(LIBC_NAMESPACE::recvmsg(sockpair[1], &recv_message, 0)), - Succeeds(static_cast(MESSAGE_LEN))); + ASSERT_THAT(LIBC_NAMESPACE::recvmsg(sockpair[1], &recv_message, 0), + Succeeds(static_cast(MESSAGE_LEN))); ASSERT_STREQ(buffer, TEST_MESSAGE); @@ -93,8 +91,7 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, SendFails) { send_message.msg_controllen = 0; send_message.msg_flags = 0; - ASSERT_THAT(static_cast(LIBC_NAMESPACE::sendmsg(-1, &send_message, 0)), - Fails(EBADF)); + 
ASSERT_THAT(LIBC_NAMESPACE::sendmsg(-1, &send_message, 0), Fails(EBADF)); } TEST_F(LlvmLibcSendMsgRecvMsgTest, RecvFails) { @@ -113,6 +110,5 @@ TEST_F(LlvmLibcSendMsgRecvMsgTest, RecvFails) { recv_message.msg_controllen = 0; recv_message.msg_flags = 0; - ASSERT_THAT(static_cast(LIBC_NAMESPACE::recvmsg(-1, &recv_message, 0)), - Fails(EBADF)); + ASSERT_THAT(LIBC_NAMESPACE::recvmsg(-1, &recv_message, 0), Fails(EBADF)); } diff --git a/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp b/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp index c3f84266d464c..8377260252c37 100644 --- a/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp +++ b/libc/test/src/sys/socket/linux/sendto_recvfrom_test.cpp @@ -54,7 +54,7 @@ TEST_F(LlvmLibcSendToRecvFromTest, SendToFails) { ASSERT_THAT( LIBC_NAMESPACE::sendto(-1, TEST_MESSAGE, MESSAGE_LEN, 0, nullptr, 0), - Fails(static_cast(EBADF), static_cast(-1))); + Fails(EBADF)); } TEST_F(LlvmLibcSendToRecvFromTest, RecvFromFails) { @@ -62,5 +62,5 @@ TEST_F(LlvmLibcSendToRecvFromTest, RecvFromFails) { ASSERT_THAT( LIBC_NAMESPACE::recvfrom(-1, buffer, sizeof(buffer), 0, nullptr, 0), - Fails(static_cast(EBADF), static_cast(-1))); + Fails(EBADF)); } diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp index 084db300bf2d6..ee06c706734fb 100644 --- a/libc/test/src/time/ctime_r_test.cpp +++ b/libc/test/src/time/ctime_r_test.cpp @@ -23,7 +23,7 @@ TEST_F(LlvmLibcCtimeR, Nullptr) { result = LIBC_NAMESPACE::ctime_r(nullptr, buffer); ASSERT_STREQ(nullptr, result); - time_t t = 2147483648; // invalid argument + time_t t; result = LIBC_NAMESPACE::ctime_r(&t, nullptr); ASSERT_STREQ(nullptr, result); } diff --git a/libc/test/src/time/mktime_test.cpp b/libc/test/src/time/mktime_test.cpp index 1062fa45ef644..1dfdd73de5440 100644 --- a/libc/test/src/time/mktime_test.cpp +++ b/libc/test/src/time/mktime_test.cpp @@ -36,8 +36,7 @@ TEST(LlvmLibcMkTime, FailureSetsErrno) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Fails(EOVERFLOW)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); } TEST(LlvmLibcMkTime, InvalidSeconds) { @@ -52,8 +51,7 @@ TEST(LlvmLibcMkTime, InvalidSeconds) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Succeeds(-1)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-1)); EXPECT_TM_EQ((tm{.tm_sec = 59, .tm_min = 59, .tm_hour = 23, @@ -77,8 +75,7 @@ TEST(LlvmLibcMkTime, InvalidSeconds) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Succeeds(60)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(60)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 1, .tm_hour = 0, @@ -104,7 +101,7 @@ TEST(LlvmLibcMkTime, InvalidMinutes) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 59, @@ -129,7 +126,7 @@ TEST(LlvmLibcMkTime, InvalidMinutes) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(60 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -156,7 +153,7 @@ TEST(LlvmLibcMkTime, InvalidHours) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - 
EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -182,7 +179,7 @@ TEST(LlvmLibcMkTime, InvalidHours) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + LIBC_NAMESPACE::mktime(&tm_data), Succeeds(24 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -208,7 +205,7 @@ TEST(LlvmLibcMkTime, InvalidYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, @@ -237,8 +234,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Fails(EOVERFLOW)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); } { @@ -252,8 +248,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Fails(EOVERFLOW)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); } { @@ -267,8 +262,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Fails(EOVERFLOW)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); } { @@ -282,8 +276,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Fails(EOVERFLOW)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); } { @@ -297,8 +290,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Fails(EOVERFLOW)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); } { @@ -312,8 +304,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Fails(EOVERFLOW)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW)); } } @@ -330,7 +321,7 @@ TEST(LlvmLibcMkTime, InvalidMonths) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-32 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -356,7 +347,7 @@ TEST(LlvmLibcMkTime, InvalidMonths) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + LIBC_NAMESPACE::mktime(&tm_data), Succeeds(LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, @@ -384,7 +375,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-1 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -409,7 +400,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - 
EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -434,7 +425,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(59 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); EXPECT_TM_EQ((tm{.tm_sec = 0, .tm_min = 0, @@ -460,7 +451,7 @@ TEST(LlvmLibcMkTime, InvalidDays) { .tm_yday = 0, .tm_isdst = 0}; EXPECT_THAT( - static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + LIBC_NAMESPACE::mktime(&tm_data), Succeeds(((2 * LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR) + 60) * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY)); @@ -490,8 +481,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Succeeds(0x7FFFFFFF)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF)); EXPECT_TM_EQ((tm{.tm_sec = 7, .tm_min = 14, .tm_hour = 3, @@ -517,8 +507,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), - Succeeds(0x7FFFFFFF - 8)); + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF - 8)); EXPECT_TM_EQ((tm{.tm_sec = 59, .tm_min = 13, .tm_hour = 3, @@ -543,7 +532,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF - 8 - 14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN)); EXPECT_TM_EQ((tm{.tm_sec = 59, @@ -570,7 +559,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF - 8 - 14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN - 3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR)); @@ -598,7 +587,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) { .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0}; - EXPECT_THAT(static_cast(LIBC_NAMESPACE::mktime(&tm_data)), + EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF - 8 - 14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN - 3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR - From 7cee8176cc1b7c316561c8077e9b6babe04804f8 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Wed, 22 Oct 2025 21:08:20 +0200 Subject: [PATCH 159/530] [Offload][OMPT] De-type profiler data in API (#340) The data type used in many API functions was still OMPT specific. This patch changes the type to void * to remove this OMPT-specific data type from the interfaces.
--- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 143 +++++++----------- .../common/OMPT/OmptProfiler.cpp | 6 +- .../common/OMPT/OmptProfiler.h | 2 +- .../common/include/GenericProfiler.h | 2 +- 4 files changed, 59 insertions(+), 94 deletions(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 0d410d1cc5fe4..8f444c60a3548 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -160,6 +160,13 @@ static Error timeDataTransferInNsAsync(void *Data) { return Plugin::success(); } + +static void * +getOrNullProfilerSpecificData(AsyncInfoWrapperTy &AsyncInfoWrapper) { + __tgt_async_info *AI = AsyncInfoWrapper; + return AI ? AI->ProfilerData : nullptr; +} + } // namespace plugin } // namespace target } // namespace omp @@ -197,40 +204,6 @@ static double getTimeOfDay() { return TimeVal; } -#ifdef OMPT_SUPPORT -namespace llvm::omp::target::plugin { -/// Returns a pointer to an OmptEventInfoTy object to be used for OMPT tracing -/// or nullptr. It is the caller's duty to free the returned pointer when no -/// longer needed. -static ompt::OmptEventInfoTy * -getOrNullOmptEventInfo(AsyncInfoWrapperTy &AsyncInfoWrapper) { - __tgt_async_info *AI = AsyncInfoWrapper; - if (!AI || !AI->ProfilerData) - return nullptr; - - // The profiler data is allocated in the profiler for each individual event. - return reinterpret_cast(AI->ProfilerData); -} - -} // namespace llvm::omp::target::plugin - -#else // OMPT_SUPPORT -namespace llvm::omp::target::ompt { - -struct OmptEventInfoTy {}; -} // namespace llvm::omp::target::ompt - -namespace llvm::omp::target::plugin { - -/// When no OMPT is enabled, return nullptr to de-facto disable the profiling -static ompt::OmptEventInfoTy * -getOrNullOmptEventInfo(AsyncInfoWrapperTy &AsyncInfoWrapper) { - return nullptr; -} - -} // namespace llvm::omp::target::plugin -#endif - namespace llvm { namespace omp { namespace target { @@ -1444,7 +1417,7 @@ struct AMDGPUQueueTy { hsa_status_t Status = hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, &Device, UINT32_MAX, UINT32_MAX, &Queue); - if (Device.Plugin.getProfiler()->shouldEnableProfiling() || + if (Device.Plugin.getProfiler()->isProfilingEnabled() || OMPX_EnableQueueProfiling) hsa_amd_profiling_set_profiler_enabled(Queue, /*Enable=*/1); @@ -1810,10 +1783,11 @@ struct AMDGPUStreamTy { Error schedProfilerKernelTiming(GenericDeviceTy *Device, hsa_agent_t Agent, AMDGPUSignalTy *OutputSignal, double TicksToTime, - ompt::OmptEventInfoTy *OmptData) { + void *ProfilerSpecificData) { Callbacks.emplace_back(timeKernelInNsAsync); - ActionArgs.emplace_back().ProfilerArgs = ProfilingInfoTy{ - &(Device->Plugin), Agent, OutputSignal, TicksToTime, OmptData}; + ActionArgs.emplace_back().ProfilerArgs = + ProfilingInfoTy{&(Device->Plugin), Agent, OutputSignal, TicksToTime, + ProfilerSpecificData}; return Plugin::success(); } @@ -1822,10 +1796,11 @@ struct AMDGPUStreamTy { hsa_agent_t Agent, AMDGPUSignalTy *OutputSignal, double TicksToTime, - ompt::OmptEventInfoTy *OmptData) { + void *ProfilerSpecificData) { Callbacks.emplace_back(timeDataTransferInNsAsync); - ActionArgs.emplace_back().ProfilerArgs = ProfilingInfoTy{ - &(Device->Plugin), Agent, OutputSignal, TicksToTime, OmptData}; + ActionArgs.emplace_back().ProfilerArgs = + ProfilingInfoTy{&(Device->Plugin), Agent, OutputSignal, TicksToTime, + ProfilerSpecificData}; return Plugin::success(); } @@ -2200,7 +2175,7 @@ struct AMDGPUStreamTy { uint32_t NumThreads[3], 
uint32_t NumBlocks[3], uint32_t GroupSize, uint32_t StackSize, AMDGPUMemoryManagerTy &MemoryManager, - ompt::OmptEventInfoTy *OmptInfo = nullptr) { + void *ProfilerSpecificData = nullptr) { if (Queue == nullptr) return Plugin::error(ErrorCode::INVALID_NULL_POINTER, "target queue was nullptr"); @@ -2223,13 +2198,12 @@ struct AMDGPUStreamTy { // TODO: Technically this conditional compilation is not needed anymore #ifdef OMPT_SUPPORT - if (OmptInfo) { - DP("OMPT-Async: Info in KernelTy >> TR ptr: %p\n", OmptInfo->TraceRecord); + if (ProfilerSpecificData) { - // OmptInfo holds function pointer to finish trace record once the kernel - // completed. + // ProfilerSpecificData holds function pointer to finish trace record once + // the kernel completed. if (auto Err = Slots[Curr].schedProfilerKernelTiming( - &Device, Agent, OutputSignal, TicksToTime, OmptInfo)) + &Device, Agent, OutputSignal, TicksToTime, ProfilerSpecificData)) return Err; } #endif @@ -2302,7 +2276,7 @@ struct AMDGPUStreamTy { /// Push an asynchronous memory copy between pinned memory buffers. Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src, uint64_t CopySize, - ompt::OmptEventInfoTy *OmptInfo = nullptr) { + void *ProfilerSpecificData = nullptr) { // Retrieve an available signal for the operation's output. AMDGPUSignalTy *OutputSignal = nullptr; if (auto Err = SignalManager.getResource(OutputSignal)) @@ -2317,11 +2291,10 @@ struct AMDGPUStreamTy { // TODO: Technically this conditional compilation is not needed anymore #ifdef OMPT_SUPPORT - if (OmptInfo) { - DP("OMPT-Async: Registering data timing in pushPinnedMemoryCopyAsync\n"); + if (ProfilerSpecificData) { // Capture the time the data transfer required for the d2h transfer. if (auto Err = Slots[Curr].schedProfilerDataTransferTiming( - &Device, Agent, OutputSignal, TicksToTime, OmptInfo)) + &Device, Agent, OutputSignal, TicksToTime, ProfilerSpecificData)) return Err; } #endif @@ -2348,7 +2321,7 @@ struct AMDGPUStreamTy { Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter, uint64_t CopySize, AMDGPUMemoryManagerTy &MemoryManager, - ompt::OmptEventInfoTy *OmptInfo = nullptr) { + void *ProfilerSpecificData = nullptr) { // Retrieve available signals for the operation's outputs. AMDGPUSignalTy *OutputSignals[2] = {}; if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) @@ -2374,11 +2347,11 @@ struct AMDGPUStreamTy { // TODO: Technically this conditional compilation is not needed anymore #ifdef OMPT_SUPPORT - if (OmptInfo) { - DP("OMPT-Async: Registering data timing in pushMemoryCopyD2HAsync\n"); + if (ProfilerSpecificData) { // Capture the time the data transfer required for the d2h transfer. if (auto Err = Slots[Curr].schedProfilerDataTransferTiming( - &Device, Agent, OutputSignals[0], TicksToTime, OmptInfo)) + &Device, Agent, OutputSignals[0], TicksToTime, + ProfilerSpecificData)) return Err; } #endif @@ -2435,7 +2408,7 @@ struct AMDGPUStreamTy { Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, uint64_t CopySize, AMDGPUMemoryManagerTy &MemoryManager, - ompt::OmptEventInfoTy *OmptInfo = nullptr, + void *ProfilerSpecificData = nullptr, size_t NumTimes = 1) { // Retrieve available signals for the operation's outputs. 
AMDGPUSignalTy *OutputSignals[2] = {}; @@ -2498,11 +2471,11 @@ struct AMDGPUStreamTy { // TODO: Technically, this conditional compilation is not needed anymore #ifdef OMPT_SUPPORT - if (OmptInfo) { - DP("OMPT-Async: Registering data timing in pushMemoryCopyH2DAsync\n"); + if (ProfilerSpecificData) { // Capture the time the data transfer required for the d2h transfer. if (auto Err = Slots[Curr].schedProfilerDataTransferTiming( - &Device, Agent, OutputSignals[0], TicksToTime, OmptInfo)) + &Device, Agent, OutputSignals[0], TicksToTime, + ProfilerSpecificData)) return Err; } #endif @@ -2523,7 +2496,7 @@ struct AMDGPUStreamTy { // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src, hsa_agent_t SrcAgent, uint64_t CopySize, - ompt::OmptEventInfoTy *OmptInfo = nullptr) { + void *ProfilerSpecificData = nullptr) { AMDGPUSignalTy *OutputSignal; if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) return Err; @@ -2537,11 +2510,10 @@ struct AMDGPUStreamTy { // TODO: Technically, this conditional compilation is not needed anymore #ifdef OMPT_SUPPORT - if (OmptInfo) { - DP("OMPT-Async: Registering data timing in pushMemoryCopyD2DAsync\n"); + if (ProfilerSpecificData) { // Capture the time the data transfer required for the d2h transfer. if (auto Err = Slots[Curr].schedProfilerDataTransferTiming( - &Device, Agent, OutputSignal, TicksToTime, OmptInfo)) + &Device, Agent, OutputSignal, TicksToTime, ProfilerSpecificData)) return Err; } #endif @@ -3746,8 +3718,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { void *PinnedPtr = nullptr; // Obtain the OMPT-related callback data - DP("OMPT-Async: dataSubmitImpl\n"); - auto LocalOmptEventInfo = getOrNullOmptEventInfo(AsyncInfoWrapper); + auto ProfilerSpecificData = getOrNullProfilerSpecificData(AsyncInfoWrapper); // Prefault GPU page table in XNACK-Enabled case, on APUs, // under the assumption that explicitly allocated memory @@ -3763,9 +3734,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { if (auto Err = getStream(AsyncInfoWrapper, Stream)) return Err; - DP("OMPT-Async: Pinned Copy\n"); return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size, - LocalOmptEventInfo); + ProfilerSpecificData); } // For large transfers use synchronous behavior. 
@@ -3793,7 +3763,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = Signal.init()) return Err; - DP("OMPT-Async: Sync Copy\n"); if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, Agent, PinnedPtr, Agent, Size, 0, nullptr, Signal.get())) @@ -3803,9 +3772,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; #ifdef OMPT_SUPPORT - if (LocalOmptEventInfo) { + if (Plugin.getProfiler()->isProfilingEnabled()) { ProfilingInfoTy OmptKernelTimingArgsAsync{ - &Plugin, Agent, &Signal, TicksToTime, LocalOmptEventInfo}; + &Plugin, Agent, &Signal, TicksToTime, ProfilerSpecificData}; if (auto Err = timeDataTransferInNsAsync(&OmptKernelTimingArgsAsync)) return Err; @@ -3828,10 +3797,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = getStream(AsyncInfoWrapper, Stream)) return Err; - DP("OMPT-Async: ASync Copy\n"); return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size, PinnedMemoryManager, - LocalOmptEventInfo); + ProfilerSpecificData); } /// Retrieve data from the device (device to host transfer). @@ -3841,8 +3809,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { void *PinnedPtr = nullptr; // Obtain the OMPT-related callback data - DP("OMPT-Async: dataRetrieveImpl\n"); - auto LocalOmptEventInfo = getOrNullOmptEventInfo(AsyncInfoWrapper); + auto ProfilerSpecificData = getOrNullProfilerSpecificData(AsyncInfoWrapper); // Prefault GPU page table in XNACK-Enabled case, on APUs, // under the assumption that explicitly allocated memory @@ -3858,9 +3825,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { if (auto Err = getStream(AsyncInfoWrapper, Stream)) return Err; - DP("OMPT-Async: Pinned Copy\n"); return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size, - LocalOmptEventInfo); + ProfilerSpecificData); } // For large transfers use synchronous behavior. @@ -3898,9 +3864,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; #ifdef OMPT_SUPPORT - if (LocalOmptEventInfo) { + if (Plugin.getProfiler()->isProfilingEnabled()) { ProfilingInfoTy OmptKernelTimingArgsAsync{ - &Plugin, Agent, &Signal, TicksToTime, LocalOmptEventInfo}; + &Plugin, Agent, &Signal, TicksToTime, ProfilerSpecificData}; if (auto Err = timeDataTransferInNsAsync(&OmptKernelTimingArgsAsync)) return Err; @@ -3925,7 +3891,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size, PinnedMemoryManager, - LocalOmptEventInfo); + ProfilerSpecificData); } /// Exchange data between two devices within the plugin. @@ -3934,8 +3900,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { AsyncInfoWrapperTy &AsyncInfoWrapper) override { AMDGPUDeviceTy &DstDevice = static_cast(DstGenericDevice); - DP("OMPT-Async: dataExchangeImpl\n"); - auto LocalOmptEventInfo = getOrNullOmptEventInfo(AsyncInfoWrapper); + auto ProfilerSpecificData = getOrNullProfilerSpecificData(AsyncInfoWrapper); // For large transfers use synchronous behavior. 
// If OMPT is enabled or synchronous behavior is explicitly requested: @@ -3957,9 +3922,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; #ifdef OMPT_SUPPORT - if (LocalOmptEventInfo) { + if (Plugin.getProfiler()->isProfilingEnabled()) { ProfilingInfoTy OmptKernelTimingArgsAsync{ - &Plugin, Agent, &Signal, TicksToTime, LocalOmptEventInfo}; + &Plugin, Agent, &Signal, TicksToTime, ProfilerSpecificData}; if (auto Err = timeDataTransferInNsAsync(&OmptKernelTimingArgsAsync)) return Err; @@ -3977,7 +3942,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr, getAgent(), (uint64_t)Size, - LocalOmptEventInfo); + ProfilerSpecificData); } /// Insert a data fence between previous data operations and the following @@ -5601,12 +5566,12 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, } // Get required OMPT-related data - auto LocalOmptEventInfo = getOrNullOmptEventInfo(AsyncInfoWrapper); + auto ProfilerSpecificData = getOrNullProfilerSpecificData(AsyncInfoWrapper); // Push the kernel launch into the stream. return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, GroupSize, static_cast(StackSize), - ArgsMemoryManager, LocalOmptEventInfo); + ArgsMemoryManager, ProfilerSpecificData); } void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice, @@ -5847,8 +5812,8 @@ static ProfilingInfoTy *getProfilingInfo(void *Data) { static std::pair getKernelStartAndEndTime(const ProfilingInfoTy *Args) { - assert(Args->Plugin && "Invalid GenericPlugin pointer in OMPT profiling"); - assert(Args->Signal && "Invalid AMDGPUSignal pointer in OMPT profiling"); + assert(Args->Plugin && "Invalid GenericPlugin pointer in profiling"); + assert(Args->Signal && "Invalid AMDGPUSignal pointer in profiling"); hsa_amd_profiling_dispatch_time_t TimeRec{0, 0}; hsa_status_t Status = hsa_amd_profiling_get_dispatch_time( @@ -5872,7 +5837,7 @@ getKernelStartAndEndTime(const ProfilingInfoTy *Args) { static std::pair getCopyStartAndEndTime(const ProfilingInfoTy *Args) { - assert(Args->Signal && "Invalid AMDGPUSignal Pointer in OMPT profiling"); + assert(Args->Signal && "Invalid AMDGPUSignal Pointer in profiling"); hsa_amd_profiling_async_copy_time_t TimeRec{0, 0}; hsa_status_t Status = diff --git a/offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp b/offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp index f54eff3781023..96c89ff6231a8 100644 --- a/offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp +++ b/offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp @@ -90,7 +90,7 @@ void llvm::omp::target::ompt::OmptProfilerTy::handlePreKernelLaunch( void llvm::omp::target::ompt::OmptProfilerTy::handleKernelCompletion( uint64_t StartNanos, uint64_t EndNanos, void *Data) { - if (!shouldEnableProfiling()) + if (!isProfilingEnabled()) return; DP("OMPT-Async: Time kernel for asynchronous execution (Plugin): Start %lu " @@ -112,7 +112,7 @@ void llvm::omp::target::ompt::OmptProfilerTy::handleKernelCompletion( void llvm::omp::target::ompt::OmptProfilerTy::handleDataTransfer( uint64_t StartNanos, uint64_t EndNanos, void *Data) { - if (!shouldEnableProfiling()) + if (!isProfilingEnabled()) return; auto OmptEventInfo = reinterpret_cast(Data); @@ -126,7 +126,7 @@ void llvm::omp::target::ompt::OmptProfilerTy::handleDataTransfer( freeProfilerDataEntry(OmptEventInfo); } -bool llvm::omp::target::ompt::OmptProfilerTy::shouldEnableProfiling() { +bool 
llvm::omp::target::ompt::OmptProfilerTy::isProfilingEnabled() { return llvm::omp::target::ompt::TracingActive; } diff --git a/offload/plugins-nextgen/common/OMPT/OmptProfiler.h b/offload/plugins-nextgen/common/OMPT/OmptProfiler.h index 0359d53eb7bd4..d8ffbd52c6daa 100644 --- a/offload/plugins-nextgen/common/OMPT/OmptProfiler.h +++ b/offload/plugins-nextgen/common/OMPT/OmptProfiler.h @@ -80,7 +80,7 @@ class OmptProfilerTy : public plugin::GenericProfilerTy { #undef bindOmptTracingFunction } - bool shouldEnableProfiling() override; + bool isProfilingEnabled() override; void handleInit(plugin::GenericDeviceTy *Device, plugin::GenericPluginTy *Plugin) override; diff --git a/offload/plugins-nextgen/common/include/GenericProfiler.h b/offload/plugins-nextgen/common/include/GenericProfiler.h index a7c5355533f6b..4df15c4f7b8be 100644 --- a/offload/plugins-nextgen/common/include/GenericProfiler.h +++ b/offload/plugins-nextgen/common/include/GenericProfiler.h @@ -60,7 +60,7 @@ class GenericProfilerTy { /// Obtain a pointer to profiler-specific data, if any. virtual void *getProfilerSpecificData() { return nullptr; } - virtual bool shouldEnableProfiling() { return false; } + virtual bool isProfilingEnabled() { return false; } /// Set the factors which are used to interpolate the device clock compared to /// the host clock. This follows a simple linear interpolation: Slope *