diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index 7c6e01d669b74..0dbf6f59d5e88 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -811,6 +811,15 @@
 
   Specify file name of the runtime instrumentation library
 
+- `--runtime-lib-init-hook=<value>`
+
+  Primary target for hooking runtime library initialization, used in
+  fallback order of availability in the input binary (entry_point -> init
+  -> init_array) (default: entry_point)
+  - `entry_point`: use ELF Header Entry Point
+  - `init`: use ELF DT_INIT entry
+  - `init_array`: use ELF .init_array entry
+
 - `--sctc-mode=<value>`
 
   Mode for simplify conditional tail calls
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index f3fe5fbd522cb..2e3cb7b91e0ce 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -200,15 +200,22 @@ This pass runs after optimizations. It performs the _inverse_ of MarkRAState pa
 
 Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
 to know what RA state these have.
 
-The current solution has the `inferUnknownStates` function to cover these, using
-a fairly simple strategy: unknown states inherit the last known state.
+> [!important]
+> As issue #160989 explains, unwind info is missing from stubs.
+> For this same reason, we cannot generate correct pac-specific unwind info: the
+> signedness of the _incorrect_ return address is meaningless.
 
-This will be updated to a more robust solution.
+Assignment of RAStates to newly generated instructions is done in `inferUnknownStates`.
+We have two different cases to cover:
 
-> [!important]
-> As issue #160989 describes, unwind info is incorrect in stubs with multiple callers.
-> For this same reason, we cannot generate correct pac-specific unwind info: the signess
-> of the _incorrect_ return address is meaningless.
+1. If a BasicBlock has some instructions with known RA state, and some without, we
+   can copy the RAState of known instructions to the unknown ones. As the control
+   flow only changes between BasicBlocks, instructions in the same BasicBlock have
+   the same return address. (The exception is noreturn calls, but these would only
+   cause problems if the newly inserted instruction is right after the call.)
+
+2. If a BasicBlock has no instructions with known RAState, we have to copy the
+   RAState of the previous BasicBlock in layout order.
 
 ### Optimizations requiring special attention
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 2af1d330b7545..8a90febcea3cc 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -807,6 +807,15 @@ class BinaryContext {
   /// the execution of the binary is completed.
   std::optional<uint64_t> FiniFunctionAddress;
 
+  /// DT_INIT.
+  std::optional<uint64_t> InitAddress;
+
+  /// DT_INIT_ARRAY. Only used when DT_INIT is not set.
+  std::optional<uint64_t> InitArrayAddress;
+
+  /// DT_INIT_ARRAYSZ. Only used when DT_INIT is not set.
+  std::optional<uint64_t> InitArraySize;
+
   /// DT_FINI.
   std::optional<uint64_t> FiniAddress;
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 836948bf5e9c0..3f003af96162d 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/InsertNegateRAStatePass.cpp ----------------------------===//
+//===- bolt/Passes/InsertNegateRAStatePass.h ------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -30,9 +30,30 @@ class InsertNegateRAState : public BinaryFunctionPass {
 private:
   /// Because states are tracked as MCAnnotations on individual instructions,
   /// newly inserted instructions do not have a state associated with them.
-  /// New states are "inherited" from the last known state.
+  /// Uses fillUnknownStateInBB and fillUnknownStubs.
   void inferUnknownStates(BinaryFunction &BF);
 
+  /// Simple case: copy RAStates to unknown insts from previous inst.
+  /// If the first inst has unknown state, set it to the first known state.
+  /// Accounts for signing and authenticating insts.
+  void fillUnknownStateInBB(BinaryContext &BC, BinaryBasicBlock &BB);
+
+  /// Fill in RAState in BasicBlocks consisting entirely of new instructions.
+  /// As of #160989, we have to copy the RAState from the previous BB in the
+  /// layout, because CFIs are already incorrect here.
+  void fillUnknownStubs(BinaryFunction &BF);
+
+  /// Returns the first known RAState from \p BB, or std::nullopt if all are
+  /// unknown.
+  std::optional<bool> getFirstKnownRAState(BinaryContext &BC,
+                                           BinaryBasicBlock &BB);
+
+  /// Returns true if all instructions in \p BB have unknown RAState.
+  bool isUnknownBlock(BinaryContext &BC, BinaryBasicBlock &BB);
+
+  /// Set all instructions in \p BB to \p State.
+  void markUnknownBlock(BinaryContext &BC, BinaryBasicBlock &BB, bool State);
+
   /// Support for function splitting:
   /// if two consecutive BBs with Signed state are going to end up in different
   /// functions (so are held by different FunctionFragments), we have to add a
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 35abf6b4d4ddd..5950b3c1630e1 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -93,14 +93,23 @@ class RewriteInstance {
   /// section allocations if found.
   void discoverBOLTReserved();
 
+  /// Check whether we should use DT_INIT or DT_INIT_ARRAY for instrumentation.
+  /// DT_INIT is preferred; DT_INIT_ARRAY is only used when no DT_INIT entry was
+  /// found.
+  Error discoverRtInitAddress();
+
   /// Check whether we should use DT_FINI or DT_FINI_ARRAY for instrumentation.
   /// DT_FINI is preferred; DT_FINI_ARRAY is only used when no DT_FINI entry was
   /// found.
   Error discoverRtFiniAddress();
 
+  /// If DT_INIT_ARRAY is used for instrumentation, update the relocation of its
+  /// first entry to point to the instrumentation library's init address.
+  Error updateRtInitReloc();
+
   /// If DT_FINI_ARRAY is used for instrumentation, update the relocation of its
   /// first entry to point to the instrumentation library's fini address.
-  void updateRtFiniReloc();
+  Error updateRtFiniReloc();
 
   /// Create and initialize metadata rewriters for this instance.
   void initializeMetadataManager();
diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp
index 5a7d02a34b4d8..0740fcef9102b 100644
--- a/bolt/lib/Passes/Inliner.cpp
+++ b/bolt/lib/Passes/Inliner.cpp
@@ -491,6 +491,32 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
         }
       }
 
+      // AArch64 BTI:
+      // If the callee has an indirect tailcall (BR), we would transform it to
+      // an indirect call (BLR) in InlineCall. Because of this, we would have to
+      // update the BTI at the target of the tailcall. However, these targets
+      // are not known. Instead, we skip inlining blocks with indirect
+      // tailcalls.
+      auto HasIndirectTailCall = [&](const BinaryFunction &BF) -> bool {
+        for (const auto &BB : BF) {
+          for (const auto &II : BB) {
+            if (BC.MIB->isIndirectBranch(II) && BC.MIB->isTailCall(II)) {
+              return true;
+            }
+          }
+        }
+        return false;
+      };
+
+      if (BC.isAArch64() && BC.usesBTI() &&
+          HasIndirectTailCall(*TargetFunction)) {
+        ++InstIt;
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Skipping inlining block with tailcall"
+                          << " in " << Function << " : " << BB->getName()
+                          << " to keep BTIs consistent.\n");
+        continue;
+      }
+
       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: inlining call to " << *TargetFunction
                         << " in " << Function << " : " << BB->getName()
                         << ". Count: " << BB->getKnownExecutionCount()
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 775b7795e77c5..ed4de8a56f89f 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -52,8 +52,8 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
       MCInst &Inst = *It;
       if (BC.MIB->isCFI(Inst))
         continue;
-      auto RAState = BC.MIB->getRAState(Inst);
-      if (!RAState) {
+      std::optional<bool> RAState = BC.MIB->getRAState(Inst);
+      if (!RAState.has_value()) {
         BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
                   << " in function " << BF.getPrintName() << "\n";
         PassFailed = true;
@@ -74,6 +74,20 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   }
 }
 
+void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+
+  // Fill in missing RAStates in simple cases (inside BBs).
+  for (BinaryBasicBlock &BB : BF) {
+    fillUnknownStateInBB(BC, BB);
+  }
+  // BasicBlocks which are made entirely of "new instructions" (instructions
+  // without RAState annotation) are stubs, and do not have correct unwind info.
+  // We should iterate in layout order and fill them based on previous known
+  // RAState.
+  fillUnknownStubs(BF);
+}
+
 void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
                                                      FunctionFragment &FF) {
   BinaryContext &BC = BF.getBinaryContext();
@@ -92,8 +106,8 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
   // If a function is already split in the input, the first FF can also start
   // with Signed state. This covers that scenario as well.
   auto II = (*FirstNonEmpty)->getFirstNonPseudo();
-  auto RAState = BC.MIB->getRAState(*II);
-  if (!RAState) {
+  std::optional<bool> RAState = BC.MIB->getRAState(*II);
+  if (!RAState.has_value()) {
     BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
               << " in function " << BF.getPrintName() << "\n";
     PassFailed = true;
@@ -104,32 +118,119 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
       MCCFIInstruction::createNegateRAState(nullptr));
 }
 
-void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
+std::optional<bool>
+InsertNegateRAState::getFirstKnownRAState(BinaryContext &BC,
+                                          BinaryBasicBlock &BB) {
+  for (const MCInst &Inst : BB) {
+    if (BC.MIB->isCFI(Inst))
+      continue;
+    std::optional<bool> RAState = BC.MIB->getRAState(Inst);
+    if (RAState.has_value())
+      return RAState;
+  }
+  return std::nullopt;
+}
+
+bool InsertNegateRAState::isUnknownBlock(BinaryContext &BC,
+                                         BinaryBasicBlock &BB) {
+  std::optional<bool> FirstRAState = getFirstKnownRAState(BC, BB);
+  return !FirstRAState.has_value();
+}
+
+void InsertNegateRAState::fillUnknownStateInBB(BinaryContext &BC,
+                                               BinaryBasicBlock &BB) {
+
+  auto First = BB.getFirstNonPseudo();
+  if (First == BB.end())
+    return;
+  // If the first instruction has unknown RAState, we should copy the first
+  // known RAState.
+  std::optional<bool> RAState = BC.MIB->getRAState(*First);
+  if (!RAState.has_value()) {
+    std::optional<bool> FirstRAState = getFirstKnownRAState(BC, BB);
+    if (!FirstRAState.has_value())
+      // We fill unknown BBs later.
+      return;
+
+    BC.MIB->setRAState(*First, *FirstRAState);
+  }
+
+  // At this point we know the RAState of the first instruction,
+  // so we can propagate the RAStates to all subsequent unknown instructions.
+  MCInst Prev = *First;
+  for (auto It = First + 1; It != BB.end(); ++It) {
+    MCInst &Inst = *It;
+    if (BC.MIB->isCFI(Inst))
+      continue;
+
+    // No need to check for nullopt: we only entered this loop after the first
+    // instruction had its RAState set, and RAState is always set for the
+    // previous instruction in the previous iteration of the loop.
+    std::optional<bool> PrevRAState = BC.MIB->getRAState(Prev);
+
+    std::optional<bool> RAState = BC.MIB->getRAState(Inst);
+    if (!RAState.has_value()) {
+      if (BC.MIB->isPSignOnLR(Prev))
+        PrevRAState = true;
+      else if (BC.MIB->isPAuthOnLR(Prev))
+        PrevRAState = false;
+      BC.MIB->setRAState(Inst, *PrevRAState);
+    }
+    Prev = Inst;
+  }
+}
+
+void InsertNegateRAState::markUnknownBlock(BinaryContext &BC,
+                                           BinaryBasicBlock &BB, bool State) {
+  // If we call this when an Instruction has either kRASigned or kRAUnsigned
+  // annotation, setRASigned or setRAUnsigned would fail.
+  assert(isUnknownBlock(BC, BB) &&
+         "markUnknownBlock should only be called on unknown blocks");
+  for (MCInst &Inst : BB) {
+    if (BC.MIB->isCFI(Inst))
+      continue;
+    BC.MIB->setRAState(Inst, State);
+  }
+}
+
+void InsertNegateRAState::fillUnknownStubs(BinaryFunction &BF) {
   BinaryContext &BC = BF.getBinaryContext();
   bool FirstIter = true;
   MCInst PrevInst;
-  for (BinaryBasicBlock &BB : BF) {
-    for (MCInst &Inst : BB) {
-      if (BC.MIB->isCFI(Inst))
-        continue;
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    for (BinaryBasicBlock *BB : FF) {
+      if (FirstIter) {
+        FirstIter = false;
+        if (isUnknownBlock(BC, *BB))
+          // If the first BasicBlock is unknown, the function's entry RAState
+          // should be used.
+          markUnknownBlock(BC, *BB, BF.getInitialRAState());
+      } else if (isUnknownBlock(BC, *BB)) {
+        // As explained in issue #160989, the unwind info is incorrect for
+        // stubs. Indicating the correct RAState without the rest of the unwind
+        // info being correct is not useful. Instead, we copy the RAState from
+        // the previous instruction.
+        std::optional<bool> PrevRAState = BC.MIB->getRAState(PrevInst);
+        if (!PrevRAState.has_value()) {
+          // No non-cfi instruction encountered in the function yet.
+          // This means the RAState is the same as at the function entry.
+          markUnknownBlock(BC, *BB, BF.getInitialRAState());
+          continue;
+        }
 
-      auto RAState = BC.MIB->getRAState(Inst);
-      if (!FirstIter && !RAState) {
         if (BC.MIB->isPSignOnLR(PrevInst))
-          RAState = true;
+          PrevRAState = true;
         else if (BC.MIB->isPAuthOnLR(PrevInst))
-          RAState = false;
-        else {
-          auto PrevRAState = BC.MIB->getRAState(PrevInst);
-          RAState = PrevRAState ? *PrevRAState : false;
-        }
-        BC.MIB->setRAState(Inst, *RAState);
-      } else {
-        FirstIter = false;
-        if (!RAState)
-          BC.MIB->setRAState(Inst, BF.getInitialRAState());
+          PrevRAState = false;
+        markUnknownBlock(BC, *BB, *PrevRAState);
       }
-      PrevInst = Inst;
+      // This function iterates on BasicBlocks, so the PrevInst has to be
+      // updated to the last instruction of the current BasicBlock. If the
+      // BasicBlock is empty, or only has PseudoInstructions, PrevInst will not
+      // be updated.
+      auto Last = BB->getLastNonPseudo();
+      if (Last != BB->rend())
+        PrevInst = *Last;
     }
   }
 }
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 8a5bbe28e5f66..1c6244b2d2bf8 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -294,6 +294,28 @@ cl::bits<GadgetScannerKind> GadgetScannersToRun(
         clEnumValN(GS_ALL, "all", "All implemented scanners")),
     cl::ZeroOrMore, cl::CommaSeparated, cl::cat(BinaryAnalysisCategory));
 
+// Primary targets for hooking runtime library initialization, with fallback
+// to the next item in case the current one is not available in the input
+// binary.
+enum RuntimeLibInitHookTarget : char {
+  RLIH_ENTRY_POINT = 0, /// Use ELF Header Entry Point
+  RLIH_INIT = 1,        /// Use ELF DT_INIT entry
+  RLIH_INIT_ARRAY = 2,  /// Use ELF .init_array entry
+};
+
+cl::opt<RuntimeLibInitHookTarget> RuntimeLibInitHook(
+    "runtime-lib-init-hook",
+    cl::desc("Primary target for hooking runtime library initialization, used "
+             "in fallback order of availability in the input binary "
+             "(entry_point -> init -> init_array) (default: entry_point)"),
+    cl::Hidden, cl::init(RLIH_ENTRY_POINT),
+    cl::values(clEnumValN(RLIH_ENTRY_POINT, "entry_point",
+                          "use ELF Header Entry Point"),
+               clEnumValN(RLIH_INIT, "init", "use ELF DT_INIT entry"),
+               clEnumValN(RLIH_INIT_ARRAY, "init_array",
+                          "use ELF .init_array entry")),
+    cl::ZeroOrMore, cl::cat(BoltOptCategory));
+
 } // namespace opts
 
 // FIXME: implement a better way to mark sections for replacement.
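The enum values above are deliberately ordered so that the fallback chain (entry_point -> init -> init_array) can be expressed with plain `<=` comparisons, which is exactly what `discoverRtInitAddress` below does. A minimal standalone sketch of that selection logic, with hypothetical `HasInterp`/`HasInit`/`HasInitArray` flags standing in for the checks on `BinaryContext`:

#include <cassert>

enum Target : char { EntryPoint = 0, Init = 1, InitArray = 2 };

// Sketch only: pick the hook mechanism the same way the rewriter does,
// falling through to the next mechanism when the preferred one is not
// available in the input binary.
Target selectInitHook(Target Primary, bool HasInterp, bool HasInit,
                      bool HasInitArray) {
  if (HasInterp && Primary == EntryPoint)
    return EntryPoint; // patch e_entry in the ELF header
  if (HasInit && Primary <= Init)
    return Init; // patch the DT_INIT dynamic entry
  assert(HasInitArray && "need DT_INIT or DT_INIT_ARRAY");
  return InitArray; // patch slot 0 of .init_array
}

A requested `entry_point` target therefore also accepts `init` and `init_array` as fallbacks, while `init_array` accepts nothing earlier in the chain.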
@@ -741,9 +763,12 @@ Error RewriteInstance::run() {
   adjustCommandLineOptions();
   discoverFileObjects();
 
-  if (opts::Instrument && !BC->IsStaticExecutable)
+  if (opts::Instrument && !BC->IsStaticExecutable) {
+    if (Error E = discoverRtInitAddress())
+      return E;
     if (Error E = discoverRtFiniAddress())
       return E;
+  }
 
   preprocessProfileData();
 
@@ -785,8 +810,12 @@ Error RewriteInstance::run() {
 
   updateMetadata();
 
-  if (opts::Instrument && !BC->IsStaticExecutable)
-    updateRtFiniReloc();
+  if (opts::Instrument && !BC->IsStaticExecutable) {
+    if (Error E = updateRtInitReloc())
+      return E;
+    if (Error E = updateRtFiniReloc())
+      return E;
+  }
 
   if (opts::OutputFilename == "/dev/null") {
     BC->outs() << "BOLT-INFO: skipping writing final binary to disk\n";
@@ -1411,6 +1440,65 @@ void RewriteInstance::discoverBOLTReserved() {
   NextAvailableAddress = BC->BOLTReserved.start();
 }
 
+Error RewriteInstance::discoverRtInitAddress() {
+  if (BC->HasInterpHeader && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT)
+    return Error::success();
+
+  // Use DT_INIT if it's available.
+  if (BC->InitAddress && opts::RuntimeLibInitHook <= opts::RLIH_INIT) {
+    BC->StartFunctionAddress = BC->InitAddress;
+    return Error::success();
+  }
+
+  if (!BC->InitArrayAddress || !BC->InitArraySize) {
+    return createStringError(std::errc::not_supported,
+                             "Instrumentation of shared library needs either "
+                             "DT_INIT or DT_INIT_ARRAY");
+  }
+
+  if (*BC->InitArraySize < BC->AsmInfo->getCodePointerSize()) {
+    return createStringError(std::errc::not_supported,
+                             "Need at least 1 DT_INIT_ARRAY slot");
+  }
+
+  ErrorOr<BinarySection &> InitArraySection =
+      BC->getSectionForAddress(*BC->InitArrayAddress);
+  if (auto EC = InitArraySection.getError())
+    return errorCodeToError(EC);
+
+  if (InitArraySection->getAddress() != *BC->InitArrayAddress) {
+    return createStringError(std::errc::not_supported,
+                             "Inconsistent address of .init_array section");
+  }
+
+  if (const Relocation *Reloc = InitArraySection->getDynamicRelocationAt(0)) {
+    if (Reloc->isRelative()) {
+      BC->StartFunctionAddress = Reloc->Addend;
+    } else {
+      MCSymbol *Sym = Reloc->Symbol;
+      if (!Sym)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate symbol for 0 entry of .init_array");
+      const BinaryFunction *BF = BC->getFunctionForSymbol(Sym);
+      if (!BF)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate binary function for 0 entry of .init_array");
+      BC->StartFunctionAddress = BF->getAddress() + Reloc->Addend;
+    }
+    return Error::success();
+  }
+
+  if (const Relocation *Reloc = InitArraySection->getRelocationAt(0)) {
+    BC->StartFunctionAddress = Reloc->Value;
+    return Error::success();
+  }
+
+  return createStringError(std::errc::not_supported,
+                           "No relocation for first DT_INIT_ARRAY slot");
+}
+
 Error RewriteInstance::discoverRtFiniAddress() {
   // Use DT_FINI if it's available.
   if (BC->FiniAddress) {
@@ -1434,6 +1522,11 @@ Error RewriteInstance::discoverRtFiniAddress() {
   if (auto EC = FiniArraySection.getError())
     return errorCodeToError(EC);
 
+  if (FiniArraySection->getAddress() != *BC->FiniArrayAddress) {
+    return createStringError(std::errc::not_supported,
+                             "Inconsistent address of .fini_array section");
+  }
+
   if (const Relocation *Reloc = FiniArraySection->getDynamicRelocationAt(0)) {
     BC->FiniFunctionAddress = Reloc->Addend;
     return Error::success();
@@ -1448,26 +1541,95 @@ Error RewriteInstance::discoverRtFiniAddress() {
                            "No relocation for first DT_FINI_ARRAY slot");
 }
 
-void RewriteInstance::updateRtFiniReloc() {
+Error RewriteInstance::updateRtInitReloc() {
+  if (BC->HasInterpHeader && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT)
+    return Error::success();
+
+  // Updating DT_INIT is handled by patchELFDynamic.
+  if (BC->InitAddress && opts::RuntimeLibInitHook <= opts::RLIH_INIT)
+    return Error::success();
+
+  const RuntimeLibrary *RT = BC->getRuntimeLibrary();
+  if (!RT || !RT->getRuntimeStartAddress())
+    return Error::success();
+
+  if (!BC->InitArrayAddress)
+    return Error::success();
+
+  if (!BC->InitArrayAddress || !BC->InitArraySize)
+    return createStringError(std::errc::not_supported,
+                             "inconsistent .init_array state");
+
+  ErrorOr<BinarySection &> InitArraySection =
+      BC->getSectionForAddress(*BC->InitArrayAddress);
+  if (!InitArraySection)
+    return createStringError(std::errc::not_supported, ".init_array removed");
+
+  if (std::optional<Relocation> Reloc =
+          InitArraySection->takeDynamicRelocationAt(0)) {
+    if (Reloc->isRelative()) {
+      if (Reloc->Addend != BC->StartFunctionAddress)
+        return createStringError(std::errc::not_supported,
+                                 "inconsistent .init_array dynamic relocation");
+      Reloc->Addend = RT->getRuntimeStartAddress();
+      InitArraySection->addDynamicRelocation(*Reloc);
+    } else {
+      MCSymbol *Sym = Reloc->Symbol;
+      if (!Sym)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate symbol for 0 entry of .init_array");
+      const BinaryFunction *BF = BC->getFunctionForSymbol(Sym);
+      if (!BF)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate binary function for 0 entry of .init_array");
+      if (BF->getAddress() + Reloc->Addend != BC->StartFunctionAddress)
+        return createStringError(std::errc::not_supported,
+                                 "inconsistent .init_array dynamic relocation");
+      InitArraySection->addDynamicRelocation(Relocation{
+          /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(),
+          /*Addend*/ RT->getRuntimeStartAddress(), /*Value*/ 0});
+    }
+  }
+  // Update the static relocation by adding a pending relocation which will get
+  // patched when flushPendingRelocations is called in rewriteFile. Note that
+  // flushPendingRelocations will calculate the value to patch as
+  // "Symbol + Addend". Since we don't have a symbol, just set the addend to the
+  // desired value.
+  InitArraySection->addPendingRelocation(Relocation{
+      /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(),
+      /*Addend*/ RT->getRuntimeStartAddress(), /*Value*/ 0});
+  BC->outs()
+      << "BOLT-INFO: runtime library initialization was hooked via .init_array "
+         "entry, set to 0x"
+      << Twine::utohexstr(RT->getRuntimeStartAddress()) << "\n";
+  return Error::success();
+}
+
+Error RewriteInstance::updateRtFiniReloc() {
   // Updating DT_FINI is handled by patchELFDynamic.
   if (BC->FiniAddress)
-    return;
+    return Error::success();
 
   const RuntimeLibrary *RT = BC->getRuntimeLibrary();
   if (!RT || !RT->getRuntimeFiniAddress())
-    return;
+    return Error::success();
 
-  assert(BC->FiniArrayAddress && BC->FiniArraySize &&
-         "inconsistent .fini_array state");
+  if (!BC->FiniArrayAddress || !BC->FiniArraySize)
+    return createStringError(std::errc::not_supported,
+                             "inconsistent .fini_array state");
 
   ErrorOr<BinarySection &> FiniArraySection =
       BC->getSectionForAddress(*BC->FiniArrayAddress);
-  assert(FiniArraySection && ".fini_array removed");
+  if (!FiniArraySection)
+    return createStringError(std::errc::not_supported, ".fini_array removed");
 
   if (std::optional<Relocation> Reloc =
           FiniArraySection->takeDynamicRelocationAt(0)) {
-    assert(Reloc->Addend == BC->FiniFunctionAddress &&
-           "inconsistent .fini_array dynamic relocation");
+    if (Reloc->Addend != BC->FiniFunctionAddress)
+      return createStringError(std::errc::not_supported,
+                               "inconsistent .fini_array dynamic relocation");
     Reloc->Addend = RT->getRuntimeFiniAddress();
     FiniArraySection->addDynamicRelocation(*Reloc);
   }
@@ -1480,6 +1642,10 @@ Error RewriteInstance::updateRtFiniReloc() {
   FiniArraySection->addPendingRelocation(Relocation{
       /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(),
       /*Addend*/ RT->getRuntimeFiniAddress(), /*Value*/ 0});
+  BC->outs() << "BOLT-INFO: runtime library finalization was hooked via "
+                ".fini_array entry, set to 0x"
+             << Twine::utohexstr(RT->getRuntimeFiniAddress()) << "\n";
+  return Error::success();
 }
 
 void RewriteInstance::registerFragments() {
@@ -2178,6 +2344,14 @@ void RewriteInstance::adjustCommandLineOptions() {
     exit(1);
   }
 
+  if (opts::Instrument && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT &&
+      !BC->HasInterpHeader) {
+    BC->errs()
+        << "BOLT-WARNING: adjusted runtime-lib-init-hook to 'init' due to "
+           "absence of INTERP header\n";
+    opts::RuntimeLibInitHook = opts::RLIH_INIT;
+  }
+
   if (opts::HotText && opts::HotTextMoveSections.getNumOccurrences() == 0) {
     opts::HotTextMoveSections.addValue(".stub");
     opts::HotTextMoveSections.addValue(".mover");
@@ -4849,9 +5023,14 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile<ELFT> *File) {
   ELFEhdrTy NewEhdr = Obj.getHeader();
 
   if (BC->HasRelocations) {
-    if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary())
+    RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary();
+    if (RtLibrary && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT) {
       NewEhdr.e_entry = RtLibrary->getRuntimeStartAddress();
-    else
+      BC->outs()
+          << "BOLT-INFO: runtime library initialization was hooked via ELF "
+             "Header Entry Point, set to 0x"
+          << Twine::utohexstr(NewEhdr.e_entry) << "\n";
+    } else
       NewEhdr.e_entry = getNewFunctionAddress(NewEhdr.e_entry);
     assert((NewEhdr.e_entry || !Obj.getHeader().e_entry) &&
            "cannot find new address for entry point");
@@ -5692,14 +5871,23 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile<ELFT> *File) {
     }
     RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary();
     if (RtLibrary && Dyn.getTag() == ELF::DT_FINI) {
-      if (uint64_t Addr = RtLibrary->getRuntimeFiniAddress())
+      if (uint64_t Addr = RtLibrary->getRuntimeFiniAddress()) {
         NewDE.d_un.d_ptr = Addr;
+        BC->outs()
+            << "BOLT-INFO: runtime library finalization was hooked via "
+               "DT_FINI, set to 0x"
+            << Twine::utohexstr(Addr) << "\n";
+      }
     }
-    if (RtLibrary && Dyn.getTag() == ELF::DT_INIT && !BC->HasInterpHeader) {
+    if (RtLibrary && Dyn.getTag() == ELF::DT_INIT &&
+        (!BC->HasInterpHeader ||
+         opts::RuntimeLibInitHook == opts::RLIH_INIT)) {
       if (auto Addr = RtLibrary->getRuntimeStartAddress()) {
-        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Set DT_INIT to 0x"
-                          << Twine::utohexstr(Addr) << '\n');
         NewDE.d_un.d_ptr = Addr;
+        BC->outs()
+            << "BOLT-INFO: runtime library initialization was hooked via "
+               "DT_INIT, set to 0x"
+            << Twine::utohexstr(Addr) << "\n";
       }
     }
     break;
@@ -5767,10 +5955,13 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile<ELFT> *File) {
   for (const Elf_Dyn &Dyn : DynamicEntries) {
     switch (Dyn.d_tag) {
     case ELF::DT_INIT:
-      if (!BC->HasInterpHeader) {
-        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Set start function address\n");
-        BC->StartFunctionAddress = Dyn.getPtr();
-      }
+      BC->InitAddress = Dyn.getPtr();
+      break;
+    case ELF::DT_INIT_ARRAY:
+      BC->InitArrayAddress = Dyn.getPtr();
+      break;
+    case ELF::DT_INIT_ARRAYSZ:
+      BC->InitArraySize = Dyn.getPtr();
       break;
     case ELF::DT_FINI:
       BC->FiniAddress = Dyn.getPtr();
diff --git a/bolt/test/AArch64/hook-fini.s b/bolt/test/AArch64/hook-fini.s
index 4f321d463ef32..3bb95f9317b1b 100644
--- a/bolt/test/AArch64/hook-fini.s
+++ b/bolt/test/AArch64/hook-fini.s
@@ -15,13 +15,13 @@
 # RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
 # RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s
 # RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
-# RUN: llvm-bolt %t.exe -o %t --instrument
+# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI %s
 # RUN: llvm-readelf -drs %t | FileCheck --check-prefix=CHECK-FINI %s
 
 # RUN: %clang %cflags -pie %s -Wl,-q,-fini=0 -o %t-no-fini.exe
 # RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
 # RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s
-# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument
+# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI-ARRAY %s
 # RUN: llvm-readelf -drs %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI %s
 # RUN: llvm-readelf -ds -x .fini_array %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI-RELOC %s
 
@@ -29,7 +29,7 @@
 # RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so
 # RUN: %clang %cflags %s -no-pie -Wl,-q,-fini=0 %t-stub.so -o %t-no-pie-no-fini.exe
 # RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
-# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument
+# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI-ARRAY %s
 # RUN: llvm-readelf -ds -x .fini_array %t-no-pie-no-fini | FileCheck --check-prefix=CHECK-NO-PIE-NO-FINI %s
 
 ## With fini: dynamic section should contain DT_FINI
@@ -46,6 +46,14 @@
 ## Without PIE: binary should not have relative relocations
 # RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE
 
+## Check BOLT output for the finalization hook (DT_FINI)
+# CHECK-BOLT-RT-FINI: runtime library finalization was hooked via DT_FINI
+# CHECK-BOLT-RT-FINI-NOT: runtime library finalization was hooked via .fini_array entry
+
+## Check BOLT output for the finalization hook (.fini_array entry)
+# CHECK-BOLT-RT-FINI-ARRAY-NOT: runtime library finalization was hooked via DT_FINI
+# CHECK-BOLT-RT-FINI-ARRAY: runtime library finalization was hooked via .fini_array entry
+
 ## Check that DT_FINI is set to __bolt_runtime_fini
 # CHECK-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries:
 # CHECK-FINI-DAG: (FINI) 0x[[FINI:[[:xdigit:]]+]]
diff --git a/bolt/test/AArch64/hook-init.s b/bolt/test/AArch64/hook-init.s
new file mode 100644
index 0000000000000..a48328b630fa0
--- /dev/null
+++ b/bolt/test/AArch64/hook-init.s
@@ -0,0 +1,221 @@
+## Test the different ways of hooking the init function for instrumentation (via
+## entry point, DT_INIT and DT_INIT_ARRAY). We test the latter for both PIE
+## and non-PIE binaries, and for both executables and shared libraries, because
+## of the different ways of handling relocations (static or dynamic).
+## All tests perform the following steps:
+## - Compile and link for the case to be tested
+## - Some sanity-checks on the dynamic section and relocations in the binary to
+##   verify it has the shape we want for testing:
+##   - INTERP in Program Headers
+##   - DT_INIT or DT_INIT_ARRAY in dynamic section
+##   - No relative relocations for non-PIE
+## - Instrument (with extra --runtime-lib-init-hook=init/init_array options
+##   in some cases)
+## - Verify generated binary
+# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
+# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t | FileCheck --check-prefix=CHECK-INIT-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-NO-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init_array | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-ARRAY-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q -o %t-shared.exe
+# RUN: llvm-readelf -d %t-shared.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t-shared.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared.exe -o %t-shared --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-shared | FileCheck --check-prefix=CHECK-SHARED-INIT %s
+
+# RUN: %clang %cflags -pie %s -Wl,-q,-init=0 -o %t-no-init.exe
+# RUN: llvm-readelf -d %t-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-no-init.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t-no-init.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t-no-init | FileCheck --check-prefix=CHECK-NO-INIT-EP %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-init-no-ep | FileCheck --check-prefix=CHECK-NO-INIT-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q,-init=0 -o %t-shared-no-init.exe
+# RUN: llvm-readelf -d %t-shared-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-shared-no-init.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared-no-init.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared-no-init.exe -o %t-shared-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -drs %t-shared-no-init | FileCheck --check-prefix=CHECK-SHARED-NO-INIT %s
+
+## Create a dummy shared library to link against to force creation of the dynamic section.
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so
+# RUN: %clang %cflags %s -no-pie -Wl,-q,-init=0 %t-stub.so -o %t-no-pie-no-init.exe
+# RUN: llvm-readelf -r %t-no-pie-no-init.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
+# RUN: llvm-bolt %t-no-pie-no-init.exe -o %t-no-pie-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hds %t-no-pie-no-init | FileCheck --check-prefix=CHECK-NO-PIE-NO-INIT-EP %s
+
+## With init: dynamic section should contain DT_INIT
+# DYN-INIT: (INIT)
+
+## Without init: dynamic section should only contain DT_INIT_ARRAY
+# DYN-NO-INIT-NOT: (INIT)
+# DYN-NO-INIT: (INIT_ARRAY)
+# DYN-NO-INIT: (INIT_ARRAYSZ)
+
+## With interp program header (executable)
+# PH-INTERP: Program Headers:
+# PH-INTERP: INTERP
+
+## Without interp program header (shared library)
+# PH-INTERP-SHARED: Program Headers:
+# PH-INTERP-SHARED-NOT: INTERP
+
+## With PIE: binary should have relative relocations
+# RELOC-PIE: R_AARCH64_RELATIVE
+
+## Shared library: binary should have absolute relocations
+# RELOC-SHARED-PIE: R_AARCH64_ABS64
+
+## Without PIE: binary should not have relative relocations
+# RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE
+
+## Check BOLT output for the initialization hook (ELF Header Entry Point)
+# CHECK-BOLT-RT-EP: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (DT_INIT)
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (.init_array entry)
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-ARRAY: runtime library initialization was hooked via .init_array entry
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-EP: ELF Header:
+# CHECK-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-NO-EP: ELF Header:
+# CHECK-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Check if ELF entry point address points to _start symbol and new DT_INIT entry points to __bolt_runtime_start
+# CHECK-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that 1st entry of DT_INIT_ARRAY is set to __bolt_runtime_start and DT_INIT was not changed
+# CHECK-INIT-ARRAY-NO-EP: ELF Header:
+# CHECK-INIT-ARRAY-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-ARRAY-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-INIT-ARRAY-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-INIT-ARRAY-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT_ADDR:]]
+# CHECK-INIT-ARRAY-NO-EP-NOT: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-INIT-ARRAY-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-EP: ELF Header:
+# CHECK-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT is set to __bolt_runtime_start for shared library with DT_INIT
+# CHECK-SHARED-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-INIT-DAG: (INIT) 0x[[#%x, INIT:]]
+# CHECK-SHARED-INIT-DAG: (INIT_ARRAY) 0x[[#%x, INIT_ARRAY:]]
+## Check that the dynamic relocation at .init_array was not patched
+# CHECK-SHARED-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-INIT-NOT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_AARCH64_ABS64 {{0+}}[[#%x, INIT]]
+## Check that dynamic section DT_INIT points to __bolt_runtime_start
+# CHECK-SHARED-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-INIT: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-NO-EP: ELF Header:
+# CHECK-NO-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+# CHECK-NO-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-NOT: (INIT)
+# CHECK-NO-INIT-NO-EP: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-NO-INIT-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-NO-INIT-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-NO-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-NO-INIT-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for shared library without DT_INIT
+# CHECK-SHARED-NO-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT-NOT: (INIT)
+# CHECK-SHARED-NO-INIT: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-SHARED-NO-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-NO-INIT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_AARCH64_ABS64 [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-SHARED-NO-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for non-PIE executable with DT_INIT
+# CHECK-NO-PIE-NO-INIT-EP: ELF Header:
+# CHECK-NO-PIE-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-PIE-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-PIE-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+  .globl _start
+  .type _start, %function
+_start:
+  # Dummy relocation to force relocation mode.
+  .reloc 0, R_AARCH64_NONE
+  ret
+.size _start, .-_start
+
+  .globl _init
+  .type _init, %function
+_init:
+  ret
+  .size _init, .-_init
+
+  .globl _fini
+  .type _fini, %function
+_fini:
+  ret
+  .size _fini, .-_fini
+
+  .section .init_array,"aw"
+  .align 3
+  .dword _init
+
+  .section .fini_array,"aw"
+  .align 3
+  .dword _fini
diff --git a/bolt/test/AArch64/inline-bti-dbg.s b/bolt/test/AArch64/inline-bti-dbg.s
new file mode 100644
index 0000000000000..a0db4589d39ac
--- /dev/null
+++ b/bolt/test/AArch64/inline-bti-dbg.s
@@ -0,0 +1,40 @@
+# This test checks that for AArch64 binaries with BTI, we do not inline blocks with indirect tailcalls.
+# Same as inline-bti.s, but checks the debug output, and therefore requires assertions.
+
+# REQUIRES: system-linux, assertions
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q -Wl,-z,force-bti
+# RUN: llvm-bolt --inline-all %t.exe -o %t.bolt --debug 2>&1 | FileCheck %s
+
+# For BTI, we should not inline foo.
+# CHECK: BOLT-DEBUG: Skipping inlining block with tailcall in _Z3barP1A : .LBB01 to keep BTIs consistent.
+# CHECK-NOT: BOLT-INFO: inlined {{[0-9]+}} calls at {{[0-9]+}} call sites in {{[0-9]+}} iteration(s). Change in binary size: {{[0-9]+}} bytes.
+
+  .text
+  .globl _Z3fooP1A
+  .type _Z3fooP1A,@function
+_Z3fooP1A:
+  ldr x8, [x0]
+  ldr w0, [x8]
+  br x30
+  .size _Z3fooP1A, .-_Z3fooP1A
+
+  .globl _Z3barP1A
+  .type _Z3barP1A,@function
+_Z3barP1A:
+  stp x29, x30, [sp, #-16]!
+  mov x29, sp
+  bl _Z3fooP1A
+  mul w0, w0, w0
+  ldp x29, x30, [sp], #16
+  ret
+  .size _Z3barP1A, .-_Z3barP1A
+
+  .globl main
+  .p2align 2
+  .type main,@function
+main:
+  mov w0, wzr
+  ret
+  .size main, .-main
diff --git a/bolt/test/AArch64/inline-bti.s b/bolt/test/AArch64/inline-bti.s
new file mode 100644
index 0000000000000..62f6ea6f4b63a
--- /dev/null
+++ b/bolt/test/AArch64/inline-bti.s
@@ -0,0 +1,38 @@
+## This test checks that for AArch64 binaries with BTI, we do not inline blocks with indirect tailcalls.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q -Wl,-z,force-bti
+# RUN: llvm-bolt --inline-all %t.exe -o %t.bolt | FileCheck %s
+
+# For BTI, we should not inline foo.
+# CHECK-NOT: BOLT-INFO: inlined {{[0-9]+}} calls at {{[0-9]+}} call sites in {{[0-9]+}} iteration(s). Change in binary size: {{[0-9]+}} bytes.
+
+  .text
+  .globl _Z3fooP1A
+  .type _Z3fooP1A,@function
+_Z3fooP1A:
+  ldr x8, [x0]
+  ldr w0, [x8]
+  br x30
+  .size _Z3fooP1A, .-_Z3fooP1A
+
+  .globl _Z3barP1A
+  .type _Z3barP1A,@function
+_Z3barP1A:
+  stp x29, x30, [sp, #-16]!
+  mov x29, sp
+  bl _Z3fooP1A
+  mul w0, w0, w0
+  ldp x29, x30, [sp], #16
+  ret
+  .size _Z3barP1A, .-_Z3barP1A
+
+  .globl main
+  .p2align 2
+  .type main,@function
+main:
+  mov w0, wzr
+  ret
+  .size main, .-main
diff --git a/bolt/test/X86/hook-init.s b/bolt/test/X86/hook-init.s
new file mode 100644
index 0000000000000..3184541f040b9
--- /dev/null
+++ b/bolt/test/X86/hook-init.s
@@ -0,0 +1,221 @@
+## Test the different ways of hooking the init function for instrumentation (via
+## entry point, DT_INIT and DT_INIT_ARRAY). We test the latter for both PIE
+## and non-PIE binaries, and for both executables and shared libraries, because
+## of the different ways of handling relocations (static or dynamic).
+## All tests perform the following steps:
+## - Compile and link for the case to be tested
+## - Some sanity-checks on the dynamic section and relocations in the binary to
+##   verify it has the shape we want for testing:
+##   - INTERP in Program Headers
+##   - DT_INIT or DT_INIT_ARRAY in dynamic section
+##   - No relative relocations for non-PIE
+## - Instrument (with extra --runtime-lib-init-hook=init/init_array options
+##   in some cases)
+## - Verify generated binary
+# REQUIRES: system-linux,bolt-runtime,target=x86_64-{{.*}}
+
+# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
+# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t | FileCheck --check-prefix=CHECK-INIT-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-NO-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init_array | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-ARRAY-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q -o %t-shared.exe
+# RUN: llvm-readelf -d %t-shared.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t-shared.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared.exe -o %t-shared --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-shared | FileCheck --check-prefix=CHECK-SHARED-INIT %s
+
+# RUN: %clang %cflags -pie %s -Wl,-q,-init=0 -o %t-no-init.exe
+# RUN: llvm-readelf -d %t-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-no-init.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t-no-init.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t-no-init | FileCheck --check-prefix=CHECK-NO-INIT-EP %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-init-no-ep | FileCheck --check-prefix=CHECK-NO-INIT-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q,-init=0 -o %t-shared-no-init.exe
+# RUN: llvm-readelf -d %t-shared-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-shared-no-init.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared-no-init.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared-no-init.exe -o %t-shared-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -drs %t-shared-no-init | FileCheck --check-prefix=CHECK-SHARED-NO-INIT %s
+
+## Create a dummy shared library to link against to force creation of the dynamic section.
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so
+# RUN: %clang %cflags %s -no-pie -Wl,-q,-init=0 %t-stub.so -o %t-no-pie-no-init.exe
+# RUN: llvm-readelf -r %t-no-pie-no-init.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
+# RUN: llvm-bolt %t-no-pie-no-init.exe -o %t-no-pie-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hds %t-no-pie-no-init | FileCheck --check-prefix=CHECK-NO-PIE-NO-INIT-EP %s
+
+## With init: dynamic section should contain DT_INIT
+# DYN-INIT: (INIT)
+
+## Without init: dynamic section should only contain DT_INIT_ARRAY
+# DYN-NO-INIT-NOT: (INIT)
+# DYN-NO-INIT: (INIT_ARRAY)
+# DYN-NO-INIT: (INIT_ARRAYSZ)
+
+## With interp program header (executable)
+# PH-INTERP: Program Headers:
+# PH-INTERP: INTERP
+
+## Without interp program header (shared library)
+# PH-INTERP-SHARED: Program Headers:
+# PH-INTERP-SHARED-NOT: INTERP
+
+## With PIE: binary should have relative relocations
+# RELOC-PIE: R_X86_64_RELATIVE
+
+## Shared library: binary should have absolute relocations
+# RELOC-SHARED-PIE: R_X86_64_64
+
+## Without PIE: binary should not have relative relocations
+# RELOC-NO-PIE-NOT: R_X86_64_RELATIVE
+
+## Check BOLT output for the initialization hook (ELF Header Entry Point)
+# CHECK-BOLT-RT-EP: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (DT_INIT)
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (1st entry of .init_array)
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-ARRAY: runtime library initialization was hooked via .init_array entry
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-EP: ELF Header:
+# CHECK-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-NO-EP: ELF Header:
+# CHECK-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Check if ELF entry point address points to _start symbol and new DT_INIT entry points to __bolt_runtime_start
+# CHECK-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that 1st entry of DT_INIT_ARRAY is set to __bolt_runtime_start and DT_INIT was not changed
+# CHECK-INIT-ARRAY-NO-EP: ELF Header:
+# CHECK-INIT-ARRAY-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-ARRAY-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-INIT-ARRAY-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-INIT-ARRAY-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT_ADDR:]]
+# CHECK-INIT-ARRAY-NO-EP-NOT: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-INIT-ARRAY-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-EP: ELF Header:
+# CHECK-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT is set to __bolt_runtime_start for shared library with DT_INIT
+# CHECK-SHARED-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-INIT-DAG: (INIT) 0x[[#%x, INIT:]]
+# CHECK-SHARED-INIT-DAG: (INIT_ARRAY) 0x[[#%x, INIT_ARRAY:]]
+## Check that the dynamic relocation at .init_array was not patched
+# CHECK-SHARED-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-INIT-NOT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_X86_64_64 {{0+}}[[#%x, INIT]]
+## Check that dynamic section DT_INIT points to __bolt_runtime_start
+# CHECK-SHARED-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-INIT: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-NO-EP: ELF Header:
+# CHECK-NO-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+# CHECK-NO-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-NOT: (INIT)
+# CHECK-NO-INIT-NO-EP: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-NO-INIT-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-NO-INIT-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-NO-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-NO-INIT-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for shared library without DT_INIT
+# CHECK-SHARED-NO-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT-NOT: (INIT)
+# CHECK-SHARED-NO-INIT: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-SHARED-NO-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-NO-INIT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_X86_64_64 [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-SHARED-NO-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for non-PIE executable with DT_INIT
+# CHECK-NO-PIE-NO-INIT-EP: ELF Header:
+# CHECK-NO-PIE-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-PIE-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-PIE-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+  .globl _start
+  .type _start, %function
+_start:
+  # Dummy relocation to force relocation mode.
+  .reloc 0, R_X86_64_NONE
+  retq
+.size _start, .-_start
+
+  .globl _init
+  .type _init, %function
+_init:
+  retq
+  .size _init, .-_init
+
+  .globl _fini
+  .type _fini, %function
+_fini:
+  retq
+  .size _fini, .-_fini
+
+  .section .init_array,"aw"
+  .align 8
+  .quad _init
+
+  .section .fini_array,"aw"
+  .align 8
+  .quad _fini
diff --git a/bolt/test/X86/internal-call-instrument-so.s b/bolt/test/X86/internal-call-instrument-so.s
index 99e5b29221409..fe23bc61afa32 100644
--- a/bolt/test/X86/internal-call-instrument-so.s
+++ b/bolt/test/X86/internal-call-instrument-so.s
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
 # Delete our BB symbols so BOLT doesn't mark them as entry points
 # RUN: llvm-strip --strip-unneeded %t.o
-# RUN: ld.lld %t.o -o %t.exe -q -shared -fini=_fini
+# RUN: ld.lld %t.o -o %t.exe -q -shared -fini=_fini -init=_init
 # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out
 
   .text
@@ -48,6 +48,13 @@
 _fini:
   hlt
   .size _fini, .-_fini
+
+  .globl _init
+  .type _init, %function
+  .p2align 4
+_init:
+  retq
+  .size _init, .-_init
 
   .data
   .globl var
 var:
diff --git a/bolt/test/runtime/X86/instrument-wrong-target.s b/bolt/test/runtime/X86/instrument-wrong-target.s
index 343d93a89ed13..fa40d43f10a0f 100644
--- a/bolt/test/runtime/X86/instrument-wrong-target.s
+++ b/bolt/test/runtime/X86/instrument-wrong-target.s
@@ -19,6 +19,13 @@
 _start:
   ret
   .size _start, .-_start
+
+  .globl _init
+  .type _init, %function
+  # Force DT_INIT to be created (needed for instrumentation).
+_init:
+  ret
+  .size _init, .-_init
 
   .globl _fini
   .type _fini, %function
   # Force DT_FINI to be created (needed for instrumentation).
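The unit tests added below exercise the in-block inference rule described in PacRetDesign.md: an instruction with no RAState annotation inherits the state in effect after its predecessor, switching to signed after an LR-signing instruction (e.g. paciasp) and to unsigned after an LR-authenticating one (e.g. autiasp). A minimal standalone model of that rule, using a hypothetical Inst record rather than the BOLT MCInst/MCPlusBuilder API:

#include <optional>
#include <vector>

struct Inst {
  bool SignsLR = false;        // e.g. paciasp
  bool AuthsLR = false;        // e.g. autiasp
  std::optional<bool> RAState; // nullopt == newly inserted, unknown
};

// Forward propagation over one basic block, mirroring what
// fillUnknownStateInBB does: an unknown instruction copies the state
// that holds after the previous instruction executed.
void fillUnknown(std::vector<Inst> &Block, bool EntryState) {
  bool Prev = EntryState;
  for (Inst &I : Block) {
    if (!I.RAState)
      I.RAState = Prev;
    // The state seen by the *next* instruction flips across sign/auth.
    Prev = I.SignsLR ? true : I.AuthsLR ? false : *I.RAState;
  }
}

In this model a block consisting entirely of unknown instructions simply inherits EntryState throughout, which corresponds to the fillUnknownStubs case where the previous block's trailing state (or the function's initial RAState) is used.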
diff --git a/bolt/unittests/CMakeLists.txt b/bolt/unittests/CMakeLists.txt
index 64414b83d39fe..d47ddc46b7388 100644
--- a/bolt/unittests/CMakeLists.txt
+++ b/bolt/unittests/CMakeLists.txt
@@ -7,3 +7,4 @@ endfunction()
 
 add_subdirectory(Core)
 add_subdirectory(Profile)
+add_subdirectory(Passes)
diff --git a/bolt/unittests/Passes/CMakeLists.txt b/bolt/unittests/Passes/CMakeLists.txt
new file mode 100644
index 0000000000000..3dc578adeb357
--- /dev/null
+++ b/bolt/unittests/Passes/CMakeLists.txt
@@ -0,0 +1,30 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
+  Object
+  MC
+  ${BOLT_TARGETS_TO_BUILD}
+  )
+
+add_bolt_unittest(PassTests
+  InsertNegateRAState.cpp
+
+  DISABLE_LLVM_LINK_LLVM_DYLIB
+  )
+
+target_link_libraries(PassTests
+  PRIVATE
+  LLVMBOLTCore
+  LLVMBOLTRewrite
+  LLVMBOLTPasses
+  LLVMBOLTProfile
+  LLVMBOLTUtils
+  )
+
+foreach (tgt ${BOLT_TARGETS_TO_BUILD})
+  include_directories(
+    ${LLVM_MAIN_SRC_DIR}/lib/Target/${tgt}
+    ${LLVM_BINARY_DIR}/lib/Target/${tgt}
+  )
+  string(TOUPPER "${tgt}" upper)
+  target_compile_definitions(PassTests PRIVATE "${upper}_AVAILABLE")
+endforeach()
diff --git a/bolt/unittests/Passes/InsertNegateRAState.cpp b/bolt/unittests/Passes/InsertNegateRAState.cpp
new file mode 100644
index 0000000000000..2ef78d381e570
--- /dev/null
+++ b/bolt/unittests/Passes/InsertNegateRAState.cpp
@@ -0,0 +1,333 @@
+//===- bolt/unittest/Passes/InsertNegateRAState.cpp -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef AARCH64_AVAILABLE
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#endif // AARCH64_AVAILABLE
+
+#include "bolt/Core/BinaryBasicBlock.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Passes/InsertNegateRAStatePass.h"
+#include "bolt/Rewrite/BinaryPassManager.h"
+#include "bolt/Rewrite/RewriteInstance.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::ELF;
+using namespace bolt;
+
+namespace {
+struct PassTester : public testing::TestWithParam<Triple::ArchType> {
+  void SetUp() override {
+    initializeLLVM();
+    prepareElf();
+    initializeBolt();
+  }
+
+protected:
+  void initializeLLVM() {
+#define BOLT_TARGET(target)                                                    \
+  LLVMInitialize##target##TargetInfo();                                        \
+  LLVMInitialize##target##TargetMC();                                          \
+  LLVMInitialize##target##AsmParser();                                         \
+  LLVMInitialize##target##Disassembler();                                      \
+  LLVMInitialize##target##Target();                                            \
+  LLVMInitialize##target##AsmPrinter();
+
+#include "bolt/Core/TargetConfig.def"
+  }
+
+#define PREPARE_FUNC(name)                                                     \
+  constexpr uint64_t FunctionAddress = 0x1000;                                 \
+  BinaryFunction *BF = BC->createBinaryFunction(                               \
+      name, *TextSection, FunctionAddress, /*Size=*/0, /*SymbolSize=*/0,       \
+      /*Alignment=*/16);                                                       \
+  /* Make sure the pass runs on the BF.*/                                      \
+  BF->updateState(BinaryFunction::State::CFG);                                 \
+  BF->setContainedNegateRAState();                                             \
+  /* All tests need at least one BB. 
*/                                                                             \
+  BinaryBasicBlock *BB = BF->addBasicBlock();                                  \
+  BF->addEntryPoint(*BB);                                                      \
+  BB->setCFIState(0);
+
+  void prepareElf() {
+    memcpy(ElfBuf, "\177ELF", 4);
+    ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
+    EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
+    EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
+    EHdr->e_machine = GetParam() == Triple::aarch64 ? EM_AARCH64 : EM_X86_64;
+    MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
+    ObjFile = cantFail(ObjectFile::createObjectFile(Source));
+  }
+  void initializeBolt() {
+    Relocation::Arch = ObjFile->makeTriple().getArch();
+    BC = cantFail(BinaryContext::createBinaryContext(
+        ObjFile->makeTriple(), std::make_shared<orc::SymbolStringPool>(),
+        ObjFile->getFileName(), nullptr, true, DWARFContext::create(*ObjFile),
+        {llvm::outs(), llvm::errs()}));
+    ASSERT_FALSE(!BC);
+    BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(
+        createMCPlusBuilder(GetParam(), BC->MIA.get(), BC->MII.get(),
+                            BC->MRI.get(), BC->STI.get())));
+
+    PassManager = std::make_unique<BinaryPassManager>(*BC);
+    PassManager->registerPass(std::make_unique<InsertNegateRAState>());
+
+    TextSection = &BC->registerOrUpdateSection(
+        ".text", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR,
+        /*Data=*/nullptr, /*Size=*/0,
+        /*Alignment=*/16);
+  }
+
+  std::vector<int> findCFIOffsets(BinaryFunction &BF) {
+    std::vector<int> Locations;
+    int Idx = 0;
+    int InstSize = 4; // AArch64
+    for (BinaryBasicBlock &BB : BF) {
+      for (MCInst &Inst : BB) {
+        if (BC->MIB->isCFI(Inst)) {
+          const MCCFIInstruction *CFI = BF.getCFIFor(Inst);
+          if (CFI->getOperation() == MCCFIInstruction::OpNegateRAState)
+            Locations.push_back(Idx * InstSize);
+        }
+        Idx++;
+      }
+    }
+    return Locations;
+  }
+
+  char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
+  std::unique_ptr<ObjectFile> ObjFile;
+  std::unique_ptr<BinaryContext> BC;
+  std::unique_ptr<BinaryPassManager> PassManager;
+  BinarySection *TextSection;
+};
+} // namespace
+
+TEST_P(PassTester, ExampleTest) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("ExampleFunction");
+
+  MCInst UnsignedInst = MCInstBuilder(AArch64::ADDSXri)
+                            .addReg(AArch64::X0)
+                            .addReg(AArch64::X0)
+                            .addImm(0)
+                            .addImm(0);
+  BC->MIB->setRAState(UnsignedInst, false);
+  BB->addInstruction(UnsignedInst);
+
+  MCInst SignedInst = MCInstBuilder(AArch64::ADDSXri)
+                          .addReg(AArch64::X0)
+                          .addReg(AArch64::X0)
+                          .addImm(1)
+                          .addImm(0);
+  BC->MIB->setRAState(SignedInst, true);
+  BB->addInstruction(SignedInst);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  /* Expected layout of BF after the pass:
+
+  .LBB0 (3 instructions, align : 1)
+    Entry Point
+    CFI State : 0
+      00000000:   adds    x0, x0, #0x0
+      00000004:   !CFI    $0      ; OpNegateRAState
+      00000004:   adds    x0, x0, #0x1
+    CFI State: 0
+  */
+  auto CFILoc = findCFIOffsets(*BF);
+  EXPECT_EQ(CFILoc.size(), 1u);
+  EXPECT_EQ(CFILoc[0], 4);
+}
+
+TEST_P(PassTester, fillUnknownStateInBBTest) {
+  /* Check that if a BB starts with unknown RAState, we can fill the unknown
+     states based on following instructions with known RAStates.
+   *
+   * .LBB0 (1 instructions, align : 1)
+   *   Entry Point
+   *   CFI State : 0
+   *     00000000:   adds    x0, x0, #0x0
+   *   CFI State: 0
+   *
+   * .LBB1 (4 instructions, align : 1)
+   *   CFI State : 0
+   *     00000004:   !CFI    $0      ; OpNegateRAState
+   *     00000004:   adds    x0, x0, #0x1
+   *     00000008:   adds    x0, x0, #0x2
+   *     0000000c:   adds    x0, x0, #0x3
+   *   CFI State: 0
+   */
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("FuncWithUnknownStateInBB");
+  BinaryBasicBlock *BB2 = BF->addBasicBlock();
+  BB2->setCFIState(0);
+
+  MCInst Unsigned = MCInstBuilder(AArch64::ADDSXri)
+                        .addReg(AArch64::X0)
+                        .addReg(AArch64::X0)
+                        .addImm(0)
+                        .addImm(0);
+  BC->MIB->setRAState(Unsigned, false);
+  BB->addInstruction(Unsigned);
+
+  MCInst Unknown = MCInstBuilder(AArch64::ADDSXri)
+                       .addReg(AArch64::X0)
+                       .addReg(AArch64::X0)
+                       .addImm(1)
+                       .addImm(0);
+  MCInst Unknown1 = MCInstBuilder(AArch64::ADDSXri)
+                        .addReg(AArch64::X0)
+                        .addReg(AArch64::X0)
+                        .addImm(2)
+                        .addImm(0);
+  MCInst Signed = MCInstBuilder(AArch64::ADDSXri)
+                      .addReg(AArch64::X0)
+                      .addReg(AArch64::X0)
+                      .addImm(3)
+                      .addImm(0);
+  BC->MIB->setRAState(Signed, true);
+  BB2->addInstruction(Unknown);
+  BB2->addInstruction(Unknown1);
+  BB2->addInstruction(Signed);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  auto CFILoc = findCFIOffsets(*BF);
+  EXPECT_EQ(CFILoc.size(), 1u);
+  EXPECT_EQ(CFILoc[0], 4);
+  // Check that the pass set Unknown and Unknown1 to signed.
+  // begin() is the CFI, begin() + 1 is Unknown, begin() + 2 is Unknown1.
+  std::optional<bool> RAState = BC->MIB->getRAState(*(BB2->begin() + 1));
+  EXPECT_TRUE(RAState.has_value());
+  EXPECT_TRUE(*RAState);
+  std::optional<bool> RAState1 = BC->MIB->getRAState(*(BB2->begin() + 2));
+  EXPECT_TRUE(RAState1.has_value());
+  EXPECT_TRUE(*RAState1);
+}
+
+TEST_P(PassTester, fillUnknownStubs) {
+  /*
+   * Stubs that are not part of the function's CFG should inherit the RAState
+   * of the preceding BasicBlock.
+   *
+   * LBB1 is not part of the CFG: LBB0 jumps unconditionally to LBB2.
+   * LBB1 would be a stub inserted in LongJmp in real code.
+   * We do not add any NegateRAState CFIs, as other CFIs are not added either.
+   * See issue #160989 for more details.
+   *
+   * .LBB0 (1 instructions, align : 1)
+   *   Entry Point
+   *     00000000:   b   .LBB2
+   *   Successors: .LBB2
+   *
+   * .LBB1 (1 instructions, align : 1)
+   *     00000004:   ret
+   *
+   * .LBB2 (1 instructions, align : 1)
+   *   Predecessors: .LBB0
+   *     00000008:   ret
+   */
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("FuncWithStub");
+  BinaryBasicBlock *BB2 = BF->addBasicBlock();
+  BB2->setCFIState(0);
+  BinaryBasicBlock *BB3 = BF->addBasicBlock();
+  BB3->setCFIState(0);
+
+  BB->addSuccessor(BB3);
+
+  // Jumping over BB2, to BB3. Set the RAState before adding the instruction,
+  // since addInstruction stores a copy.
+  MCInst Jump;
+  BC->MIB->createUncondBranch(Jump, BB3->getLabel(), BC->Ctx.get());
+  BC->MIB->setRAState(Jump, false);
+  BB->addInstruction(Jump);
+
+  // BB2, in real code it would be a ShortJmp.
+  // Unknown RAState.
+  MCInst StubInst;
+  BC->MIB->createReturn(StubInst);
+  BB2->addInstruction(StubInst);
+
+  // Can be any instruction.
+  MCInst Ret;
+  BC->MIB->createReturn(Ret);
+  BC->MIB->setRAState(Ret, false);
+  BB3->addInstruction(Ret);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  // Check that we did not generate any NegateRAState CFIs.
+  auto CFILoc = findCFIOffsets(*BF);
+  EXPECT_EQ(CFILoc.size(), 0u);
+}
+
+TEST_P(PassTester, fillUnknownStubsEmpty) {
+  /*
+   * This test checks that BOLT can set the RAState of unknown BBs,
+   * even if all previous BBs are empty and hence no PrevInst gets set.
+   *
+   * As this means that the current (empty) BB is the first with non-pseudo
+   * instructions, the function's initialRAState should be used.
+   */
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("FuncWithStub");
+  BF->setInitialRAState(false);
+  BinaryBasicBlock *BB2 = BF->addBasicBlock();
+  BB2->setCFIState(0);
+
+  // BB is empty.
+  BB->addSuccessor(BB2);
+
+  // BB2, in real code it would be a ShortJmp.
+  // Unknown RAState.
+  MCInst StubInst;
+  BC->MIB->createReturn(StubInst);
+  BB2->addInstruction(StubInst);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  // Check that BOLT added an RAState to BB2.
+  std::optional<bool> RAState = BC->MIB->getRAState(*(BB2->begin()));
+  EXPECT_TRUE(RAState.has_value());
+  // BB2 should be set to BF.initialRAState (false).
+  EXPECT_FALSE(*RAState);
+}
+
+#ifdef AARCH64_AVAILABLE
+INSTANTIATE_TEST_SUITE_P(AArch64, PassTester,
+                         ::testing::Values(Triple::aarch64));
+#endif
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
index fe78ad8056443..38143c94cd3ae 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
@@ -29,9 +29,9 @@ STL containers for which ``operator[]`` is well-defined for all inputs
 are excluded from this check (e.g.: ``std::map::operator[]``).
 
 This check enforces part of the `SL.con.3
-`
+`_
 guideline and is part of the `Bounds Safety (Bounds 4)
-`
+`_
 profile from the C++ Core Guidelines.
 
 Options
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index e7ca7b0bd0792..ab3f2c48983ca 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -266,7 +266,7 @@ implementation.
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | has_device_addr clause on target construct | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | iterators in map clause or motion clauses | :none:`unclaimed` | | +| device | iterators in map clause or motion clauses | :none:`done` | https://github.com/llvm/llvm-project/pull/159112 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | indirect clause on declare target directive | :part:`In Progress` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index ee2321dd158d4..5394b2558b407 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4524,6 +4524,11 @@ class RecordDecl : public TagDecl { return field_begin() == field_end(); } + /// Returns the number of fields (non-static data members) in this record. + unsigned getNumFields() const { + return std::distance(field_begin(), field_end()); + } + /// noload_fields - Iterate over the fields stored in this record /// that are currently loaded; don't attempt to retrieve anything /// from an external source. diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 72c5efde7449b..d9c3cf239451e 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -7582,7 +7582,8 @@ class OMPToClause final : public OMPMappableExprListClause, /// Motion-modifiers for the 'to' clause. OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { - OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; /// Location of motion-modifiers for the 'to' clause. SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; @@ -7654,6 +7655,9 @@ class OMPToClause final : public OMPMappableExprListClause, MotionModifiersLoc[I] = TLoc; } + void setIteratorModifier(Expr *IteratorModifier) { + getTrailingObjects()[2 * varlist_size()] = IteratorModifier; + } /// Set colon location. void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; } @@ -7662,7 +7666,7 @@ class OMPToClause final : public OMPMappableExprListClause, size_t numTrailingObjects(OverloadToken) const { // There are varlist_size() of expressions, and varlist_size() of // user-defined mappers. - return 2 * varlist_size(); + return 2 * varlist_size() + 1; } size_t numTrailingObjects(OverloadToken) const { return getUniqueDeclarationsNum(); @@ -7688,15 +7692,14 @@ class OMPToClause final : public OMPMappableExprListClause, /// \param UDMQualifierLoc C++ nested name specifier for the associated /// user-defined mapper. /// \param MapperId The identifier of associated user-defined mapper. 
- static OMPToClause *Create(const ASTContext &C, const OMPVarListLocTy &Locs, - ArrayRef Vars, - ArrayRef Declarations, - MappableExprComponentListsRef ComponentLists, - ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, - NestedNameSpecifierLoc UDMQualifierLoc, - DeclarationNameInfo MapperId); + static OMPToClause * + Create(const ASTContext &C, const OMPVarListLocTy &Locs, + ArrayRef Vars, ArrayRef Declarations, + MappableExprComponentListsRef ComponentLists, + ArrayRef UDMapperRefs, Expr *IteratorModifier, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); /// Creates an empty clause with the place for \a NumVars variables. /// @@ -7717,7 +7720,9 @@ class OMPToClause final : public OMPMappableExprListClause, "Requested modifier exceeds the total number of modifiers."); return MotionModifiers[Cnt]; } - + Expr *getIteratorModifier() const { + return getTrailingObjects()[2 * varlist_size()]; + } /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' /// locations. /// @@ -7782,7 +7787,8 @@ class OMPFromClause final /// Motion-modifiers for the 'from' clause. OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { - OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; /// Location of motion-modifiers for the 'from' clause. SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; @@ -7843,7 +7849,9 @@ class OMPFromClause final "Unexpected index to store motion modifier, exceeds array size."); MotionModifiers[I] = T; } - + void setIteratorModifier(Expr *IteratorModifier) { + getTrailingObjects()[2 * varlist_size()] = IteratorModifier; + } /// Set location for the motion-modifier. /// /// \param I index for motion-modifier location. @@ -7862,7 +7870,7 @@ class OMPFromClause final size_t numTrailingObjects(OverloadToken) const { // There are varlist_size() of expressions, and varlist_size() of // user-defined mappers. - return 2 * varlist_size(); + return 2 * varlist_size() + 1; } size_t numTrailingObjects(OverloadToken) const { return getUniqueDeclarationsNum(); @@ -7892,7 +7900,7 @@ class OMPFromClause final Create(const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, - ArrayRef UDMapperRefs, + ArrayRef UDMapperRefs, Expr *IteratorExpr, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); @@ -7916,7 +7924,9 @@ class OMPFromClause final "Requested modifier exceeds the total number of modifiers."); return MotionModifiers[Cnt]; } - + Expr *getIteratorModifier() const { + return getTrailingObjects()[2 * varlist_size()]; + } /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' /// locations. /// diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index b98b946cad75a..ceac89d3aba6d 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -207,6 +207,7 @@ OPENMP_MAP_MODIFIER_KIND(ompx_hold) // Modifiers for 'to' or 'from' clause. OPENMP_MOTION_MODIFIER_KIND(mapper) +OPENMP_MOTION_MODIFIER_KIND(iterator) OPENMP_MOTION_MODIFIER_KIND(present) // Static attributes for 'dist_schedule' clause. 
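
The `2 * varlist_size() + 1` storage change above is easy to misread, so here is a simplified stand-alone model of where the single iterator-modifier expression lives; this is not the real `OMPToClause`/`OMPFromClause` layout (which uses `llvm::TrailingObjects` on an in-place allocation), and `MotionClauseStorage` is an invented name:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

struct Expr; // stand-in for clang::Expr

// Slots [0, N) hold the variable references, [N, 2N) the user-defined
// mapper references, and slot 2N the single iterator-modifier expression.
class MotionClauseStorage {
  std::size_t NumVars;
  std::vector<Expr *> Trailing; // models getTrailingObjects<Expr *>()

public:
  explicit MotionClauseStorage(std::size_t N)
      : NumVars(N), Trailing(2 * N + 1, nullptr) {}

  void setVarRef(std::size_t I, Expr *E) {
    assert(I < NumVars && "variable slot out of range");
    Trailing[I] = E;
  }
  void setUDMapperRef(std::size_t I, Expr *E) {
    assert(I < NumVars && "mapper slot out of range");
    Trailing[NumVars + I] = E;
  }
  void setIteratorModifier(Expr *E) { Trailing[2 * NumVars] = E; }
  Expr *getIteratorModifier() const { return Trailing[2 * NumVars]; }
};
```

This also shows why the `CreateEmpty` paths above explicitly call `setIteratorModifier(nullptr)`: the real trailing buffer is allocated uninitialized, and deserialization only writes the extra slot when the `iterator` modifier is present.
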
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 686e51ee92a08..2d05b4423140b 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -1351,7 +1351,7 @@ class SemaOpenMP : public SemaBase { OMPClause * ActOnOpenMPToClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers = {}); @@ -1359,7 +1359,7 @@ class SemaOpenMP : public SemaBase { OMPClause * ActOnOpenMPFromClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers = {}); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index dd0b8e790d444..58e84ef70abb7 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1705,6 +1705,9 @@ bool Compiler::VisitFixedPointUnaryOperator(const UnaryOperator *E) { template bool Compiler::VisitImplicitValueInitExpr( const ImplicitValueInitExpr *E) { + if (DiscardResult) + return true; + QualType QT = E->getType(); if (OptPrimType T = classify(QT)) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8496b58105c7a..971fce541bb88 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1921,6 +1921,10 @@ static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC, if (PtrA.isDummy() || PtrB.isDummy()) return false; + if (!CheckRange(S, OpPC, PtrA, AK_Read) || + !CheckRange(S, OpPC, PtrB, AK_Read)) + return false; + // Now, read both pointers to a buffer and compare those. BitcastBuffer BufferA( Bits(ASTCtx.getTypeSize(ElemTypeA) * PtrA.getNumElems())); diff --git a/clang/lib/AST/ComparisonCategories.cpp b/clang/lib/AST/ComparisonCategories.cpp index 0c7a7f4eacbbf..1b9c938e2ace3 100644 --- a/clang/lib/AST/ComparisonCategories.cpp +++ b/clang/lib/AST/ComparisonCategories.cpp @@ -49,7 +49,7 @@ bool ComparisonCategoryInfo::ValueInfo::hasValidIntValue() const { // Before we attempt to get the value of the first field, ensure that we // actually have one (and only one) field. 
const auto *Record = VD->getType()->getAsCXXRecordDecl(); - if (std::distance(Record->field_begin(), Record->field_end()) != 1 || + if (Record->getNumFields() != 1 || !Record->field_begin()->getType()->isIntegralOrEnumerationType()) return false; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b986ee6ca4fa3..e5af4cb049ba9 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -3971,8 +3971,7 @@ static bool constructAggregate(EvalInfo &Info, const FPOptions FPO, if (auto *CXXRD = dyn_cast(RD)) NumBases = CXXRD->getNumBases(); - *Res = APValue(APValue::UninitStruct(), NumBases, - std::distance(RD->field_begin(), RD->field_end())); + *Res = APValue(APValue::UninitStruct(), NumBases, RD->getNumFields()); SmallVector> ReverseList; // we need to traverse backwards @@ -5529,8 +5528,8 @@ static bool handleDefaultInitValue(QualType T, APValue &Result) { Result = APValue((const FieldDecl *)nullptr); return true; } - Result = APValue(APValue::UninitStruct(), RD->getNumBases(), - std::distance(RD->field_begin(), RD->field_end())); + Result = + APValue(APValue::UninitStruct(), RD->getNumBases(), RD->getNumFields()); unsigned Index = 0; for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(), @@ -7184,7 +7183,7 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This, if (!Result.hasValue()) { if (!RD->isUnion()) Result = APValue(APValue::UninitStruct(), RD->getNumBases(), - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); else // A union starts with no active member. Result = APValue((const FieldDecl*)nullptr); @@ -8135,8 +8134,7 @@ class BufferToAPValueConverter { if (auto *CXXRD = dyn_cast(RD)) NumBases = CXXRD->getNumBases(); - APValue ResultVal(APValue::UninitStruct(), NumBases, - std::distance(RD->field_begin(), RD->field_end())); + APValue ResultVal(APValue::UninitStruct(), NumBases, RD->getNumFields()); // Visit the base classes. if (auto *CXXRD = dyn_cast(RD)) { @@ -11146,7 +11144,7 @@ static bool HandleClassZeroInitialization(EvalInfo &Info, const Expr *E, assert(!RD->isUnion() && "Expected non-union class type"); const CXXRecordDecl *CD = dyn_cast(RD); Result = APValue(APValue::UninitStruct(), CD ? CD->getNumBases() : 0, - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); if (RD->isInvalidDecl()) return false; const ASTRecordLayout &Layout = Info.Ctx.getASTRecordLayout(RD); @@ -11342,7 +11340,7 @@ bool RecordExprEvaluator::VisitCXXParenListOrInitListExpr( if (!Result.hasValue()) Result = APValue(APValue::UninitStruct(), CXXRD ? 
CXXRD->getNumBases() : 0, - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); unsigned ElementNo = 0; bool Success = true; @@ -11549,8 +11547,7 @@ bool RecordExprEvaluator::VisitLambdaExpr(const LambdaExpr *E) { if (ClosureClass->isInvalidDecl()) return false; - const size_t NumFields = - std::distance(ClosureClass->field_begin(), ClosureClass->field_end()); + const size_t NumFields = ClosureClass->getNumFields(); assert(NumFields == (size_t)std::distance(E->capture_init_begin(), E->capture_init_end()) && diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 0640fed823771..2183d77de8fa7 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1321,7 +1321,7 @@ OMPToClause *OMPToClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, + Expr *IteratorModifier, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; @@ -1343,7 +1343,7 @@ OMPToClause *OMPToClause::Create( void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); @@ -1353,6 +1353,7 @@ OMPToClause *OMPToClause::Create( Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); Clause->setClauseInfo(Declarations, ComponentLists); + Clause->setIteratorModifier(IteratorModifier); return Clause; } @@ -1361,17 +1362,19 @@ OMPToClause *OMPToClause::CreateEmpty(const ASTContext &C, void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); - return new (Mem) OMPToClause(Sizes); + OMPToClause *Clause = new (Mem) OMPToClause(Sizes); + Clause->setIteratorModifier(nullptr); + return Clause; } OMPFromClause *OMPFromClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, + Expr *IteratorModifier, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; @@ -1393,7 +1396,7 @@ OMPFromClause *OMPFromClause::Create( void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); @@ -1404,6 +1407,7 @@ OMPFromClause *OMPFromClause::Create( Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); Clause->setClauseInfo(Declarations, ComponentLists); + Clause->setIteratorModifier(IteratorModifier); return Clause; } @@ -1413,10 +1417,12 @@ OMPFromClause::CreateEmpty(const ASTContext &C, void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); - return new (Mem) OMPFromClause(Sizes); + OMPFromClause *Clause = new (Mem) OMPFromClause(Sizes); + Clause->setIteratorModifier(nullptr); + return Clause; } void 
OMPUseDevicePtrClause::setPrivateCopies(ArrayRef VL) { @@ -2694,12 +2700,16 @@ template void OMPClausePrinter::VisitOMPMotionClause(T *Node) { OS << '('; for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { if (Node->getMotionModifier(I) != OMPC_MOTION_MODIFIER_unknown) { - OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), - Node->getMotionModifier(I)); - if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_mapper) - PrintMapper(OS, Node, Policy); - if (I < ModifierCount - 1) - OS << ", "; + if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) { + PrintIterator(OS, Node, Policy); + } else { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), + Node->getMotionModifier(I)); + if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_mapper) + PrintMapper(OS, Node, Policy); + if (I < ModifierCount - 1) + OS << ", "; + } } } OS << ':'; diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index f5c07fe2e33ff..bbe85986b07fc 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -816,8 +816,7 @@ CGHLSLRuntime::handleStructSemanticLoad( const llvm::StructType *ST = cast(Type); const clang::RecordDecl *RD = Decl->getType()->getAsRecordDecl(); - assert(std::distance(RD->field_begin(), RD->field_end()) == - ST->getNumElements()); + assert(RD->getNumFields() == ST->getNumElements()); llvm::Value *Aggregate = llvm::PoisonValue::get(Type); auto FieldDecl = RD->field_begin(); @@ -849,8 +848,7 @@ CGHLSLRuntime::handleStructSemanticStore( RD = Decl->getType()->getAsRecordDecl(); assert(RD); - assert(std::distance(RD->field_begin(), RD->field_end()) == - ST->getNumElements()); + assert(RD->getNumFields() == ST->getNumElements()); auto FieldDecl = RD->field_begin(); for (unsigned I = 0; I < ST->getNumElements(); ++I) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 5ceaaf30b8d24..a735295d6e78c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -8683,6 +8683,15 @@ class MappableExprsHandler { if (llvm::is_contained(C->getMotionModifiers(), OMPC_MOTION_MODIFIER_present)) Kind = Present; + if (llvm::is_contained(C->getMotionModifiers(), + OMPC_MOTION_MODIFIER_iterator)) { + if (auto *IteratorExpr = dyn_cast( + C->getIteratorModifier()->IgnoreParenImpCasts())) { + const auto *VD = cast(IteratorExpr->getIteratorDecl(0)); + CGF.EmitVarDecl(*VD); + } + } + const auto *EI = C->getVarRefs().begin(); for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), Kind, std::get<1>(L), OMPC_MAP_to, {}, @@ -8699,6 +8708,15 @@ class MappableExprsHandler { if (llvm::is_contained(C->getMotionModifiers(), OMPC_MOTION_MODIFIER_present)) Kind = Present; + if (llvm::is_contained(C->getMotionModifiers(), + OMPC_MOTION_MODIFIER_iterator)) { + if (auto *IteratorExpr = dyn_cast( + C->getIteratorModifier()->IgnoreParenImpCasts())) { + const auto *VD = cast(IteratorExpr->getIteratorDecl(0)); + CGF.EmitVarDecl(*VD); + } + } + const auto *EI = C->getVarRefs().begin(); for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), Kind, std::get<1>(L), OMPC_MAP_from, {}, diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 3b69c286634bb..15c3f7594bf44 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -4925,19 +4925,28 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, break; Data.MotionModifiers.push_back(Modifier); 
Data.MotionModifiersLoc.push_back(Tok.getLocation()); - ConsumeToken(); - if (Modifier == OMPC_MOTION_MODIFIER_mapper) { - IsInvalidMapperModifier = parseMapperModifier(Data); - if (IsInvalidMapperModifier) + if (PP.getSpelling(Tok) == "iterator" && getLangOpts().OpenMP >= 51) { + ExprResult Tail; + Tail = ParseOpenMPIteratorsExpr(); + Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(), + /*DiscardedValue=*/false); + if (Tail.isUsable()) + Data.IteratorExpr = Tail.get(); + } else { + ConsumeToken(); + if (Modifier == OMPC_MOTION_MODIFIER_mapper) { + IsInvalidMapperModifier = parseMapperModifier(Data); + if (IsInvalidMapperModifier) + break; + } + // OpenMP < 5.1 doesn't permit a ',' or additional modifiers. + if (getLangOpts().OpenMP < 51) break; + // OpenMP 5.1 accepts an optional ',' even if the next character is ':'. + // TODO: Is that intentional? + if (Tok.is(tok::comma)) + ConsumeToken(); } - // OpenMP < 5.1 doesn't permit a ',' or additional modifiers. - if (getLangOpts().OpenMP < 51) - break; - // OpenMP 5.1 accepts an optional ',' even if the next character is ':'. - // TODO: Is that intentional? - if (Tok.is(tok::comma)) - ConsumeToken(); } if (!Data.MotionModifiers.empty() && Tok.isNot(tok::colon)) { if (!IsInvalidMapperModifier) { diff --git a/clang/lib/Sema/CodeCompleteConsumer.cpp b/clang/lib/Sema/CodeCompleteConsumer.cpp index e3fc7c11f4594..50a552272f421 100644 --- a/clang/lib/Sema/CodeCompleteConsumer.cpp +++ b/clang/lib/Sema/CodeCompleteConsumer.cpp @@ -539,8 +539,7 @@ unsigned CodeCompleteConsumer::OverloadCandidate::getNumParams() const { return Template->getTemplateParameters()->size(); if (Kind == CK_Aggregate) { - unsigned Count = - std::distance(AggregateType->field_begin(), AggregateType->field_end()); + unsigned Count = AggregateType->getNumFields(); if (const auto *CRD = dyn_cast(AggregateType)) Count += CRD->getNumBases(); return Count; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 836f5517f13f1..4a88891954acd 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -19155,16 +19155,16 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, ExtraModifierLoc, ColonLoc, VarList, Locs); break; case OMPC_to: - Res = - ActOnOpenMPToClause(Data.MotionModifiers, Data.MotionModifiersLoc, - Data.ReductionOrMapperIdScopeSpec, - Data.ReductionOrMapperId, ColonLoc, VarList, Locs); + Res = ActOnOpenMPToClause( + Data.MotionModifiers, Data.MotionModifiersLoc, Data.IteratorExpr, + Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, ColonLoc, + VarList, Locs); break; case OMPC_from: - Res = ActOnOpenMPFromClause(Data.MotionModifiers, Data.MotionModifiersLoc, - Data.ReductionOrMapperIdScopeSpec, - Data.ReductionOrMapperId, ColonLoc, VarList, - Locs); + Res = ActOnOpenMPFromClause( + Data.MotionModifiers, Data.MotionModifiersLoc, Data.IteratorExpr, + Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, ColonLoc, + VarList, Locs); break; case OMPC_use_device_ptr: Res = ActOnOpenMPUseDevicePtrClause(VarList, Locs); @@ -24902,11 +24902,12 @@ void SemaOpenMP::ActOnOpenMPDeclareTargetInitializer(Decl *TargetDecl) { OMPClause *SemaOpenMP::ActOnOpenMPToClause( ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, + ArrayRef MotionModifiersLoc, Expr *IteratorExpr, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { OpenMPMotionModifierKind Modifiers[] = 
{OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; @@ -24930,20 +24931,25 @@ OMPClause *SemaOpenMP::ActOnOpenMPToClause( MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; - + if (IteratorExpr) + if (auto *DRE = dyn_cast(IteratorExpr)) + if (auto *VD = dyn_cast(DRE->getDecl())) + DSAStack->addIteratorVarDecl(VD); return OMPToClause::Create( getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); + MVLI.VarComponents, MVLI.UDMapperList, IteratorExpr, Modifiers, + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId); } OMPClause *SemaOpenMP::ActOnOpenMPFromClause( ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, + ArrayRef MotionModifiersLoc, Expr *IteratorExpr, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { OpenMPMotionModifierKind Modifiers[] = {OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; @@ -24967,11 +24973,15 @@ OMPClause *SemaOpenMP::ActOnOpenMPFromClause( MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; - + if (IteratorExpr) + if (auto *DRE = dyn_cast(IteratorExpr)) + if (auto *VD = dyn_cast(DRE->getDecl())) + DSAStack->addIteratorVarDecl(VD); return OMPFromClause::Create( getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); + MVLI.VarComponents, MVLI.UDMapperList, IteratorExpr, Modifiers, + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId); } OMPClause * diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 0e8b674a006d0..8e5dbeb792348 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2221,13 +2221,14 @@ class TreeTransform { OMPClause * RebuildOMPToClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { return getSema().OpenMP().ActOnOpenMPToClause( - MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, - ColonLoc, VarList, Locs, UnresolvedMappers); + MotionModifiers, MotionModifiersLoc, IteratorModifier, + MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, + UnresolvedMappers); } /// Build a new OpenMP 'from' clause. 
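
For orientation, this is the kind of source the plumbing above accepts once parsing, Sema, and codegen agree. The snippet is illustrative only (OpenMP 5.1 syntax, not a test from this patch; the function and variable names are made up):

```cpp
// Illustrative OpenMP 5.1 use of the new 'iterator' motion modifier: 'it'
// ranges over 0..n-1, and each a[it] is transferred by 'target update'.
void refresh_on_device(int *a, int n) {
#pragma omp target update to(iterator(it = 0 : n) : a[it])
}
```

TreeTransform must transform the stored iterator expression itself, as the hunks below do, because template instantiation can rewrite the iterator bounds.
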
@@ -2237,13 +2238,14 @@ class TreeTransform { OMPClause * RebuildOMPFromClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { return getSema().OpenMP().ActOnOpenMPFromClause( - MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, - ColonLoc, VarList, Locs, UnresolvedMappers); + MotionModifiers, MotionModifiersLoc, IteratorModifier, + MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, + UnresolvedMappers); } /// Build a new OpenMP 'use_device_ptr' clause. @@ -11535,6 +11537,13 @@ template OMPClause *TreeTransform::TransformOMPToClause(OMPToClause *C) { OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); llvm::SmallVector Vars; + Expr *IteratorModifier = C->getIteratorModifier(); + if (IteratorModifier) { + ExprResult MapModRes = getDerived().TransformExpr(IteratorModifier); + if (MapModRes.isInvalid()) + return nullptr; + IteratorModifier = MapModRes.get(); + } CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperIdInfo; llvm::SmallVector UnresolvedMappers; @@ -11542,14 +11551,22 @@ OMPClause *TreeTransform::TransformOMPToClause(OMPToClause *C) { *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; return getDerived().RebuildOMPToClause( - C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, - MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); + C->getMotionModifiers(), C->getMotionModifiersLoc(), IteratorModifier, + MapperIdScopeSpec, MapperIdInfo, C->getColonLoc(), Vars, Locs, + UnresolvedMappers); } template OMPClause *TreeTransform::TransformOMPFromClause(OMPFromClause *C) { OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); llvm::SmallVector Vars; + Expr *IteratorModifier = C->getIteratorModifier(); + if (IteratorModifier) { + ExprResult MapModRes = getDerived().TransformExpr(IteratorModifier); + if (MapModRes.isInvalid()) + return nullptr; + IteratorModifier = MapModRes.get(); + } CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperIdInfo; llvm::SmallVector UnresolvedMappers; @@ -11557,8 +11574,9 @@ OMPClause *TreeTransform::TransformOMPFromClause(OMPFromClause *C) { *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; return getDerived().RebuildOMPFromClause( - C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, - MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); + C->getMotionModifiers(), C->getMotionModifiersLoc(), IteratorModifier, + MapperIdScopeSpec, MapperIdInfo, C->getColonLoc(), Vars, Locs, + UnresolvedMappers); } template diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b0c7bae46f09e..47a1dc8f5a478 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12387,6 +12387,8 @@ void OMPClauseReader::VisitOMPToClause(OMPToClause *C) { C->setMotionModifier( I, static_cast(Record.readInt())); C->setMotionModifierLoc(I, Record.readSourceLocation()); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + C->setIteratorModifier(Record.readExpr()); } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); @@ -12443,6 +12445,8 @@ void OMPClauseReader::VisitOMPFromClause(OMPFromClause *C) { 
C->setMotionModifier( I, static_cast(Record.readInt())); C->setMotionModifierLoc(I, Record.readSourceLocation()); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + C->setIteratorModifier(Record.readExpr()); } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index e8c0d3f2b4ee9..fcee93c0ebbd3 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -8417,6 +8417,8 @@ void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) { for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { Record.push_back(C->getMotionModifier(I)); Record.AddSourceLocation(C->getMotionModifierLoc(I)); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + Record.AddStmt(C->getIteratorModifier()); } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); @@ -8447,6 +8449,8 @@ void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) { for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { Record.push_back(C->getMotionModifier(I)); Record.AddSourceLocation(C->getMotionModifierLoc(I)); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + Record.AddStmt(C->getIteratorModifier()); } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 4a53cb66b2fdd..3076b5239ebbe 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1545,6 +1545,13 @@ namespace Memcmp { int unknown; void foo(void) { unknown *= __builtin_memcmp(0, 0, 2); } + + constexpr int onepasttheend(char a) { + __builtin_memcmp(&a, &a + 1, 1); // both-note {{read of dereferenced one-past-the-end pointer}} + return 1; + } + static_assert(onepasttheend(10)); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} } namespace Memchr { diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index bffd557ff77a6..0d3d97b5eeab2 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -392,3 +392,16 @@ void plainComplex(void) { _Complex cd; // all-warning {{_Complex double}} cd = *(_Complex *)&(struct { double r, i; }){0.0, 0.0}; // all-warning {{_Complex double}} } + +/// This test results in an ImplicitValueInitExpr with DiscardResult set. 
+struct M{ + char c; +}; +typedef struct S64 { + struct M m; + char a[64]; +} I64; + +_Static_assert((((I64){}, 1)), ""); // all-warning {{left operand of comma operator has no effect}} \ + // pedantic-warning {{use of an empty initializer is a C23 extension}} \ + // pedantic-warning {{expression is not an integer constant expression; folding it to a constant is a GNU extension}} diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp index 16e7542a8e826..600aae211087a 100644 --- a/clang/test/OpenMP/cancel_codegen.cpp +++ b/clang/test/OpenMP/cancel_codegen.cpp @@ -774,10 +774,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]]) // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER:%.*]] // CHECK3: omp_section_loop.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTERSECTIONS_FINI:%.*]] -// CHECK3: omp_section_loop.aftersections.fini: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]] -// CHECK3: omp_section_loop.preheader13: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_PREHEADER16:%.*]] +// CHECK3: omp_section_loop.preheader16: // CHECK3-NEXT: store i32 0, ptr [[P_LOWERBOUND29]], align 4 // CHECK3-NEXT: store i32 1, ptr [[P_UPPERBOUND30]], align 4 // CHECK3-NEXT: store i32 1, ptr [[P_STRIDE31]], align 4 @@ -787,54 +785,52 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[P_UPPERBOUND30]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], [[TMP9]] // CHECK3-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 1 -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER14:%.*]] -// CHECK3: omp_section_loop.header14: -// CHECK3-NEXT: [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER13]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ] -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_COND15:%.*]] -// CHECK3: omp_section_loop.cond15: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER17:%.*]] +// CHECK3: omp_section_loop.header17: +// CHECK3-NEXT: [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER16]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ] +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_COND18:%.*]] +// CHECK3: omp_section_loop.cond18: // CHECK3-NEXT: [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP12]] -// CHECK3-NEXT: br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY16:%.*]], label [[OMP_SECTION_LOOP_EXIT18:%.*]] -// CHECK3: omp_section_loop.body16: +// CHECK3-NEXT: br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY19:%.*]], label [[OMP_SECTION_LOOP_EXIT21:%.*]] +// CHECK3: omp_section_loop.body19: // CHECK3-NEXT: [[TMP13:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP9]] // CHECK3-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 1 // CHECK3-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 0 // CHECK3-NEXT: switch i32 [[TMP15]], label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER:%.*]] [ -// CHECK3-NEXT: i32 0, label [[OMP_SECTION_LOOP_BODY_CASE23:%.*]] -// CHECK3-NEXT: i32 1, label [[OMP_SECTION_LOOP_BODY_CASE25:%.*]] +// CHECK3-NEXT: i32 0, label [[OMP_SECTION_LOOP_BODY_CASE26:%.*]] +// CHECK3-NEXT: i32 1, label [[OMP_SECTION_LOOP_BODY_CASE29:%.*]] // CHECK3-NEXT: ] -// CHECK3: omp_section_loop.body.case23: +// CHECK3: omp_section_loop.body.case26: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_cancel(ptr 
@[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3) // CHECK3-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0 -// CHECK3-NEXT: br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]] -// CHECK3: omp_section_loop.body.case23.split: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]] -// CHECK3: omp_section_loop.body.case23.section.after: +// CHECK3-NEXT: br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE26_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE26_CNCL:%.*]] +// CHECK3: omp_section_loop.body.case26.split: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE26_SECTION_AFTER:%.*]] +// CHECK3: omp_section_loop.body.case26.section.after: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] -// CHECK3: omp_section_loop.body.case25: +// CHECK3: omp_section_loop.body.case29: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3) // CHECK3-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0 -// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]] -// CHECK3: omp_section_loop.body.case25.split: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]] -// CHECK3: omp_section_loop.body.case25.section.after26: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]] -// CHECK3: omp_section_loop.body.case25.section.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] -// CHECK3: omp_section_loop.body16.sections.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_INC17]] -// CHECK3: omp_section_loop.inc17: +// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE29_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE29_CNCL:%.*]] +// CHECK3: omp_section_loop.body.case29.split: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER29:%.*]] +// CHECK3: omp_section_loop.body.case29.section.after30: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE29_SECTION_AFTER:%.*]] +// CHECK3: omp_section_loop.body.case29.section.after: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY19_SECTIONS_AFTER:.*]] +// CHECK3: omp_section_loop.body19.sections.after: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_INC20:.*]] +// CHECK3: omp_section_loop.inc20: // CHECK3-NEXT: [[OMP_SECTION_LOOP_NEXT22]] = add nuw i32 [[OMP_SECTION_LOOP_IV20]], 1 -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER14]] -// CHECK3: omp_section_loop.exit18: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER17]] +// CHECK3: omp_section_loop.exit21: // CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]]) // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]]) -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER19:%.*]] -// CHECK3: omp_section_loop.after19: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER19SECTIONS_FINI:%.*]] -// CHECK3: omp_section_loop.after19sections.fini: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER22:%.*]] +// CHECK3: omp_section_loop.after22: // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP20]], ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr 
[[DOTCAPTURE_EXPR_]], align 4 @@ -891,11 +887,11 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .cancel.exit: // CHECK3-NEXT: br label [[CANCEL_EXIT:%.*]] // CHECK3: omp_section_loop.body.case.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT]] -// CHECK3: omp_section_loop.body.case23.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18]] -// CHECK3: omp_section_loop.body.case25.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18]] +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT:.*]] +// CHECK3: omp_section_loop.body.case26.cncl: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18:.*]] +// CHECK3: omp_section_loop.body.case29.cncl: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT21:.*]] // CHECK3: .cancel.continue: // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] // CHECK3: omp_if.else: @@ -954,8 +950,17 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP2]], 0.000000e+00 // CHECK3-NEXT: br i1 [[TOBOOL]], label [[TMP14:%.*]], label [[TMP3:%.*]] // CHECK3: 3: -// CHECK3-NEXT: br label [[TMP4:%.*]] -// CHECK3: 4: +// CHECK3-NEXT: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK3-NEXT: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 1) +// CHECK3-NEXT: %[[COND:.*]] = icmp eq i32 %[[CANCEL_POINT]], 0 +// CHECK3-NEXT: br i1 %[[COND]], label %[[SPLIT:.*]], label %[[CNCL:.*]] +// CHECK3: .cncl: +// CHECK3-NEXT: br label %[[FINI:.*]] +// CHECK3: .fini: +// CHECK3-NEXT: br label %[[EXIT_STUB:omp.par.exit.exitStub]] +// CHECK3: .split: +// CHECK3-NEXT: br label [[TMP6:%.*]] +// CHECK3: 6: // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[LOADGEP_ARGC_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = trunc i32 [[TMP5]] to i8 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[LOADGEP_ARGV_ADDR]], align 8 @@ -967,8 +972,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_cancel_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) // CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 // CHECK3-NEXT: br i1 [[TMP9]], label [[DOTCONT:%.*]], label [[DOTCNCL5:%.*]] -// CHECK3: .cncl5: -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +// CHECK3: .cncl7: +// CHECK3-NEXT: br label %[[FINI]] // CHECK3: .cont: // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[LOADGEP_ARGC_ADDR]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[LOADGEP_ARGV_ADDR]], align 8 @@ -984,18 +989,16 @@ for (int i = 0; i < argc; ++i) { // CHECK3: omp.par.region.parallel.after: // CHECK3-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK3: omp.par.pre_finalize: -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]] -// CHECK3: 14: +// CHECK3-NEXT: br label %[[FINI]] +// CHECK3: 16: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32 1) // CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 // CHECK3-NEXT: br i1 [[TMP16]], label [[DOTSPLIT:%.*]], label [[DOTCNCL:%.*]] -// CHECK3: .cncl: -// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_cancel_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]] -// CHECK3: .split: -// CHECK3-NEXT: br label [[TMP4]] +// CHECK3: .cncl4: +// CHECK3-NEXT: br label %[[FINI]] +// CHECK3: .split3: +// CHECK3-NEXT: br label {{.+}} 
// CHECK3: omp.par.exit.exitStub: // CHECK3-NEXT: ret void // @@ -1089,7 +1092,7 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] // CHECK3: .omp.sections.case.cncl: -// CHECK3-NEXT: br label [[CANCEL_CONT:%.*]] +// CHECK3-NEXT: br label [[FINI:%.*]] // CHECK3: .omp.sections.exit: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -1100,7 +1103,7 @@ for (int i = 0; i < argc; ++i) { // CHECK3: omp.inner.for.end: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB19:[0-9]+]]) // CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB15]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) -// CHECK3-NEXT: br label [[CANCEL_CONT]] +// CHECK3-NEXT: br label [[CANCEL_CONT:.*]] // CHECK3: cancel.cont: // CHECK3-NEXT: ret void // CHECK3: cancel.exit: @@ -1153,6 +1156,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] // CHECK3: .omp.sections.case.cncl: +// CHECK3-NEXT: br label [[DOTFINI:.%*]] +// CHECK3: .fini: // CHECK3-NEXT: br label [[CANCEL_CONT:%.*]] // CHECK3: .omp.sections.case2: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) @@ -1162,9 +1167,11 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case2.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]] // CHECK3: .omp.sections.case2.section.after: -// CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] +// CHECK3-NEXT: br label [[OMP_REGION_FINALIZE:.*]] +// CHECK3: omp_region.finalize: +// CHECK3-NEXT: br label [[OMP_SECTIONS_EXIT:.*]] // CHECK3: .omp.sections.case2.cncl: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_END]] +// CHECK3-NEXT: br label [[FINI:.*]] // CHECK3: .omp.sections.exit: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: diff --git a/clang/test/OpenMP/critical_codegen.cpp b/clang/test/OpenMP/critical_codegen.cpp index 5c752d354804b..9620613dfdb87 100644 --- a/clang/test/OpenMP/critical_codegen.cpp +++ b/clang/test/OpenMP/critical_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_critical(ptr [[DEFAULT_LOC]], i32 [[GTID]], ptr [[UNNAMED_LOCK]]) #pragma omp critical a = 2; diff --git a/clang/test/OpenMP/critical_codegen_attr.cpp b/clang/test/OpenMP/critical_codegen_attr.cpp index 32482a92e76b8..50b0b04fcfd4a 100644 --- a/clang/test/OpenMP/critical_codegen_attr.cpp +++ b/clang/test/OpenMP/critical_codegen_attr.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_critical(ptr [[DEFAULT_LOC]], i32 [[GTID]], ptr [[UNNAMED_LOCK]]) [[omp::directive(critical)]] a = 2; diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c index 607d5a60b5cfb..f946c09726853 100644 --- a/clang/test/OpenMP/irbuilder_nested_parallel_for.c +++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c @@ -137,6 +137,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region.parallel.after: // 
CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK: omp.par.pre_finalize: +// CHECK-NEXT: br label [[DOTFINI:%.*]] +// CHECK: .fini: // CHECK-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]] // CHECK: omp_loop.body: // CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP5]] @@ -161,7 +163,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -184,7 +186,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -204,7 +206,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -238,11 +240,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5:![0-9]+]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6:![0-9]+]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7:![0-9]+]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8 // CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TID_ADDR]], align 4 @@ -266,6 +268,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK: omp.par.pre_finalize: +// CHECK-NEXT: br label [[DOTFINI16:%.*]] +// CHECK: .fini16: // CHECK-NEXT: br label 
[[OMP_PAR_EXIT_EXITSTUB:%.*]] // CHECK: omp.par.exit.exitStub: // CHECK-NEXT: ret void @@ -275,11 +279,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR2:%.*]], ptr noalias [[ZERO_ADDR3:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry4: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4 @@ -331,6 +335,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region5.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE6:%.*]] // CHECK: omp.par.pre_finalize6: +// CHECK-NEXT: br label [[DOTFINI:%.*]] +// CHECK: .fini: // CHECK-NEXT: br label [[OMP_PAR_EXIT7_EXITSTUB:%.*]] // CHECK: omp_loop.body: // CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]] @@ -362,7 +368,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_1:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -385,7 +391,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -405,7 +411,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -417,14 +423,14 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[R_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca 
i32, align 4 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 -// CHECK-NEXT: [[I188:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[AGG_CAPTURED189:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8 -// CHECK-NEXT: [[AGG_CAPTURED190:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4 -// CHECK-NEXT: [[DOTCOUNT_ADDR191:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LASTITER206:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LOWERBOUND207:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_UPPERBOUND208:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_STRIDE209:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I191:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED192:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED193:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR194:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER209:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND210:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND211:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE212:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[R]], ptr [[R_ADDR]], align 8 // CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 // CHECK-NEXT: store double [[B]], ptr [[B_ADDR]], align 8 @@ -440,53 +446,53 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.23, ptr [[STRUCTARG]]) // CHECK-NEXT: br label [[OMP_PAR_EXIT:%.*]] // CHECK: omp.par.exit: -// CHECK-NEXT: store i32 0, ptr [[I188]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED189]], i32 0, i32 0 -// CHECK-NEXT: store ptr [[I188]], ptr [[TMP0]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED190]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I188]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I191]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED192]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[I191]], ptr [[TMP0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED193]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I191]], align 4 // CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4 -// CHECK-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR191]], ptr [[AGG_CAPTURED189]]) -// CHECK-NEXT: [[DOTCOUNT192:%.*]] = load i32, ptr [[DOTCOUNT_ADDR191]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER193:%.*]] -// CHECK: omp_loop.preheader193: -// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND207]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT192]], 1 -// CHECK-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND208]], align 4 -// CHECK-NEXT: store i32 1, ptr [[P_STRIDE209]], align 4 -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM210:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM210]], i32 34, ptr [[P_LASTITER206]], ptr [[P_LOWERBOUND207]], ptr [[P_UPPERBOUND208]], ptr [[P_STRIDE209]], i32 1, i32 0) -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND207]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND208]], align 4 -// CHECK-NEXT: [[TRIP_COUNT_MINUS1211:%.*]] = sub i32 [[TMP5]], [[TMP4]] -// CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1211]], 1 
-// CHECK-NEXT: br label [[OMP_LOOP_HEADER194:%.*]] -// CHECK: omp_loop.header194: -// CHECK-NEXT: [[OMP_LOOP_IV200:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER193]] ], [ [[OMP_LOOP_NEXT202:%.*]], [[OMP_LOOP_INC197:%.*]] ] -// CHECK-NEXT: br label [[OMP_LOOP_COND195:%.*]] -// CHECK: omp_loop.cond195: -// CHECK-NEXT: [[OMP_LOOP_CMP201:%.*]] = icmp ult i32 [[OMP_LOOP_IV200]], [[TMP6]] -// CHECK-NEXT: br i1 [[OMP_LOOP_CMP201]], label [[OMP_LOOP_BODY196:%.*]], label [[OMP_LOOP_EXIT198:%.*]] -// CHECK: omp_loop.body196: -// CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV200]], [[TMP4]] -// CHECK-NEXT: call void @__captured_stmt.20(ptr [[I188]], i32 [[TMP7]], ptr [[AGG_CAPTURED190]]) +// CHECK-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR194]], ptr [[AGG_CAPTURED192]]) +// CHECK-NEXT: [[DOTCOUNT195:%.*]] = load i32, ptr [[DOTCOUNT_ADDR194]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER196:%.*]] +// CHECK: omp_loop.preheader196: +// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND210]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT195]], 1 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND211]], align 4 +// CHECK-NEXT: store i32 1, ptr [[P_STRIDE212]], align 4 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM213:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM213]], i32 34, ptr [[P_LASTITER209]], ptr [[P_LOWERBOUND210]], ptr [[P_UPPERBOUND211]], ptr [[P_STRIDE212]], i32 1, i32 0) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND210]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND211]], align 4 +// CHECK-NEXT: [[TRIP_COUNT_MINUS1214:%.*]] = sub i32 [[TMP5]], [[TMP4]] +// CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1214]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER197:%.*]] +// CHECK: omp_loop.header197: +// CHECK-NEXT: [[OMP_LOOP_IV203:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER196]] ], [ [[OMP_LOOP_NEXT205:%.*]], [[OMP_LOOP_INC200:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND198:%.*]] +// CHECK: omp_loop.cond198: +// CHECK-NEXT: [[OMP_LOOP_CMP204:%.*]] = icmp ult i32 [[OMP_LOOP_IV203]], [[TMP6]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP204]], label [[OMP_LOOP_BODY199:%.*]], label [[OMP_LOOP_EXIT201:%.*]] +// CHECK: omp_loop.body199: +// CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV203]], [[TMP4]] +// CHECK-NEXT: call void @__captured_stmt.20(ptr [[I191]], i32 [[TMP7]], ptr [[AGG_CAPTURED193]]) // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[CONV203:%.*]] = sitofp i32 [[TMP8]] to double +// CHECK-NEXT: [[CONV206:%.*]] = sitofp i32 [[TMP8]] to double // CHECK-NEXT: [[TMP9:%.*]] = load double, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ADD204:%.*]] = fadd double [[CONV203]], [[TMP9]] -// CHECK-NEXT: [[CONV205:%.*]] = fptrunc double [[ADD204]] to float +// CHECK-NEXT: [[ADD207:%.*]] = fadd double [[CONV206]], [[TMP9]] +// CHECK-NEXT: [[CONV208:%.*]] = fptrunc double [[ADD207]] to float // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[R_ADDR]], align 8 -// CHECK-NEXT: store float [[CONV205]], ptr [[TMP10]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_INC197]] -// CHECK: omp_loop.inc197: -// CHECK-NEXT: [[OMP_LOOP_NEXT202]] = add nuw i32 [[OMP_LOOP_IV200]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER194]] -// CHECK: omp_loop.exit198: -// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM210]]) -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM212:%.*]] = call i32 
@__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM212]]) -// CHECK-NEXT: br label [[OMP_LOOP_AFTER199:%.*]] -// CHECK: omp_loop.after199: +// CHECK-NEXT: store float [[CONV208]], ptr [[TMP10]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC200]] +// CHECK: omp_loop.inc200: +// CHECK-NEXT: [[OMP_LOOP_NEXT205]] = add nuw i32 [[OMP_LOOP_IV203]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER197]] +// CHECK: omp_loop.exit201: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM213]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM215:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM215]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER202:%.*]] +// CHECK: omp_loop.after202: // CHECK-NEXT: ret void // // @@ -494,16 +500,16 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8 -// CHECK-NEXT: [[P_LASTITER181:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LOWERBOUND182:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_UPPERBOUND183:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_STRIDE184:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER183:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND184:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND185:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE186:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4 @@ -516,10 +522,10 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_3:%.*]], align 8 // CHECK-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_4:%.*]], align 4 // CHECK-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[I163:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[AGG_CAPTURED164:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8 -// CHECK-NEXT: [[AGG_CAPTURED165:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4 -// CHECK-NEXT: [[DOTCOUNT_ADDR166:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I165:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED166:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED167:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR168:%.*]] = alloca i32, align 4 // 
CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] // CHECK: omp.par.region: // CHECK-NEXT: store i32 0, ptr [[I]], align 4 @@ -567,58 +573,60 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.22, ptr [[STRUCTARG]]) // CHECK-NEXT: br label [[OMP_PAR_EXIT11:%.*]] // CHECK: omp.par.exit11: -// CHECK-NEXT: store i32 0, ptr [[I163]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED164]], i32 0, i32 0 -// CHECK-NEXT: store ptr [[I163]], ptr [[TMP9]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED165]], i32 0, i32 0 -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I163]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I165]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED166]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[I165]], ptr [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED167]], i32 0, i32 0 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I165]], align 4 // CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -// CHECK-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR166]], ptr [[AGG_CAPTURED164]]) -// CHECK-NEXT: [[DOTCOUNT167:%.*]] = load i32, ptr [[DOTCOUNT_ADDR166]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER168:%.*]] -// CHECK: omp_loop.preheader168: -// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND182]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT167]], 1 -// CHECK-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND183]], align 4 -// CHECK-NEXT: store i32 1, ptr [[P_STRIDE184]], align 4 -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM185:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM185]], i32 34, ptr [[P_LASTITER181]], ptr [[P_LOWERBOUND182]], ptr [[P_UPPERBOUND183]], ptr [[P_STRIDE184]], i32 1, i32 0) -// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND182]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND183]], align 4 -// CHECK-NEXT: [[TRIP_COUNT_MINUS1186:%.*]] = sub i32 [[TMP14]], [[TMP13]] -// CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1186]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER169:%.*]] -// CHECK: omp_loop.header169: -// CHECK-NEXT: [[OMP_LOOP_IV175:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER168]] ], [ [[OMP_LOOP_NEXT177:%.*]], [[OMP_LOOP_INC172:%.*]] ] -// CHECK-NEXT: br label [[OMP_LOOP_COND170:%.*]] -// CHECK: omp_loop.cond170: -// CHECK-NEXT: [[OMP_LOOP_CMP176:%.*]] = icmp ult i32 [[OMP_LOOP_IV175]], [[TMP15]] -// CHECK-NEXT: br i1 [[OMP_LOOP_CMP176]], label [[OMP_LOOP_BODY171:%.*]], label [[OMP_LOOP_EXIT173:%.*]] -// CHECK: omp_loop.exit173: -// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM185]]) +// CHECK-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR168]], ptr [[AGG_CAPTURED166]]) +// CHECK-NEXT: [[DOTCOUNT169:%.*]] = load i32, ptr [[DOTCOUNT_ADDR168]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER170:%.*]] +// CHECK: omp_loop.preheader170: +// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND184]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT169]], 1 +// CHECK-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND185]], align 4 +// CHECK-NEXT: store i32 1, ptr [[P_STRIDE186]], align 4 // 
CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM187:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM187]]) -// CHECK-NEXT: br label [[OMP_LOOP_AFTER174:%.*]] -// CHECK: omp_loop.after174: +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM187]], i32 34, ptr [[P_LASTITER183]], ptr [[P_LOWERBOUND184]], ptr [[P_UPPERBOUND185]], ptr [[P_STRIDE186]], i32 1, i32 0) +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND184]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND185]], align 4 +// CHECK-NEXT: [[TRIP_COUNT_MINUS1188:%.*]] = sub i32 [[TMP14]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1188]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER171:%.*]] +// CHECK: omp_loop.header171: +// CHECK-NEXT: [[OMP_LOOP_IV177:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER170]] ], [ [[OMP_LOOP_NEXT179:%.*]], [[OMP_LOOP_INC174:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND172:%.*]] +// CHECK: omp_loop.cond172: +// CHECK-NEXT: [[OMP_LOOP_CMP178:%.*]] = icmp ult i32 [[OMP_LOOP_IV177]], [[TMP15]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP178]], label [[OMP_LOOP_BODY173:%.*]], label [[OMP_LOOP_EXIT175:%.*]] +// CHECK: omp_loop.exit175: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM187]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM189:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM189]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER176:%.*]] +// CHECK: omp_loop.after176: // CHECK-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]] // CHECK: omp.par.region.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK: omp.par.pre_finalize: +// CHECK-NEXT: br label [[DOTFINI190:%.*]] +// CHECK: .fini190: // CHECK-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]] -// CHECK: omp_loop.body171: -// CHECK-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV175]], [[TMP13]] -// CHECK-NEXT: call void @__captured_stmt.18(ptr [[I163]], i32 [[TMP16]], ptr [[AGG_CAPTURED165]]) +// CHECK: omp_loop.body173: +// CHECK-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV177]], [[TMP13]] +// CHECK-NEXT: call void @__captured_stmt.18(ptr [[I165]], i32 [[TMP16]], ptr [[AGG_CAPTURED167]]) // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4 -// CHECK-NEXT: [[CONV178:%.*]] = sitofp i32 [[TMP17]] to double +// CHECK-NEXT: [[CONV180:%.*]] = sitofp i32 [[TMP17]] to double // CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8 -// CHECK-NEXT: [[ADD179:%.*]] = fadd double [[CONV178]], [[TMP18]] -// CHECK-NEXT: [[CONV180:%.*]] = fptrunc double [[ADD179]] to float +// CHECK-NEXT: [[ADD181:%.*]] = fadd double [[CONV180]], [[TMP18]] +// CHECK-NEXT: [[CONV182:%.*]] = fptrunc double [[ADD181]] to float // CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8 -// CHECK-NEXT: store float [[CONV180]], ptr [[TMP19]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_INC172]] -// CHECK: omp_loop.inc172: -// CHECK-NEXT: [[OMP_LOOP_NEXT177]] = add nuw i32 [[OMP_LOOP_IV175]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER169]] +// CHECK-NEXT: store float [[CONV182]], ptr [[TMP19]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC174]] +// CHECK: omp_loop.inc174: +// CHECK-NEXT: [[OMP_LOOP_NEXT179]] = add nuw i32 [[OMP_LOOP_IV177]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER171]] // CHECK: omp_loop.body: // 
CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]] // CHECK-NEXT: call void @__captured_stmt.6(ptr [[I]], i32 [[TMP20]], ptr [[AGG_CAPTURED1]]) @@ -641,17 +649,17 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR6:%.*]], ptr noalias [[ZERO_ADDR7:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry8: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] -// CHECK-NEXT: [[STRUCTARG213:%.*]] = alloca { ptr, ptr, ptr }, align 8 +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] +// CHECK-NEXT: [[STRUCTARG216:%.*]] = alloca { ptr, ptr, ptr }, align 8 // CHECK-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8 -// CHECK-NEXT: [[P_LASTITER156:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LOWERBOUND157:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_UPPERBOUND158:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_STRIDE159:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER157:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND158:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND159:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE160:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LASTITER95:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND96:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND97:%.*]] = alloca i32, align 4 @@ -672,10 +680,10 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[AGG_CAPTURED78:%.*]] = alloca [[STRUCT_ANON_9:%.*]], align 8 // CHECK-NEXT: [[AGG_CAPTURED79:%.*]] = alloca [[STRUCT_ANON_10:%.*]], align 4 // CHECK-NEXT: [[DOTCOUNT_ADDR80:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[I138:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[AGG_CAPTURED139:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8 -// CHECK-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4 -// CHECK-NEXT: [[DOTCOUNT_ADDR141:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I139:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED141:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR142:%.*]] = alloca i32, align 4 // CHECK-NEXT: br label [[OMP_PAR_REGION9:%.*]] // CHECK: omp.par.region9: // CHECK-NEXT: store i32 0, ptr [[I16]], align 4 @@ -757,69 +765,71 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[OMP_LOOP_AFTER88:%.*]] // CHECK: omp_loop.after88: // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM102:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: br label [[OMP_PARALLEL217:%.*]] -// CHECK: omp_parallel217: -// CHECK-NEXT: [[GEP_A_ADDR214:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 0 -// CHECK-NEXT: store ptr 
[[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR214]], align 8 -// CHECK-NEXT: [[GEP_B_ADDR215:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 1 -// CHECK-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR215]], align 8 -// CHECK-NEXT: [[GEP_R_ADDR216:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 2 -// CHECK-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR216]], align 8 -// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG213]]) +// CHECK-NEXT: br label [[OMP_PARALLEL220:%.*]] +// CHECK: omp_parallel220: +// CHECK-NEXT: [[GEP_A_ADDR217:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR217]], align 8 +// CHECK-NEXT: [[GEP_B_ADDR218:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 1 +// CHECK-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR218]], align 8 +// CHECK-NEXT: [[GEP_R_ADDR219:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR219]], align 8 +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG216]]) // CHECK-NEXT: br label [[OMP_PAR_EXIT108:%.*]] // CHECK: omp.par.exit108: -// CHECK-NEXT: store i32 0, ptr [[I138]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED139]], i32 0, i32 0 -// CHECK-NEXT: store ptr [[I138]], ptr [[TMP16]], align 8 -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED140]], i32 0, i32 0 -// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I138]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I139]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED140]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[I139]], ptr [[TMP16]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED141]], i32 0, i32 0 +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I139]], align 4 // CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4 -// CHECK-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR141]], ptr [[AGG_CAPTURED139]]) -// CHECK-NEXT: [[DOTCOUNT142:%.*]] = load i32, ptr [[DOTCOUNT_ADDR141]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER143:%.*]] -// CHECK: omp_loop.preheader143: -// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND157]], align 4 -// CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT142]], 1 -// CHECK-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND158]], align 4 -// CHECK-NEXT: store i32 1, ptr [[P_STRIDE159]], align 4 -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM160:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM160]], i32 34, ptr [[P_LASTITER156]], ptr [[P_LOWERBOUND157]], ptr [[P_UPPERBOUND158]], ptr [[P_STRIDE159]], i32 1, i32 0) -// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND157]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND158]], align 4 -// CHECK-NEXT: [[TRIP_COUNT_MINUS1161:%.*]] = sub i32 [[TMP21]], [[TMP20]] -// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1161]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER144:%.*]] -// CHECK: omp_loop.header144: -// CHECK-NEXT: 
[[OMP_LOOP_IV150:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER143]] ], [ [[OMP_LOOP_NEXT152:%.*]], [[OMP_LOOP_INC147:%.*]] ] -// CHECK-NEXT: br label [[OMP_LOOP_COND145:%.*]] -// CHECK: omp_loop.cond145: -// CHECK-NEXT: [[OMP_LOOP_CMP151:%.*]] = icmp ult i32 [[OMP_LOOP_IV150]], [[TMP22]] -// CHECK-NEXT: br i1 [[OMP_LOOP_CMP151]], label [[OMP_LOOP_BODY146:%.*]], label [[OMP_LOOP_EXIT148:%.*]] -// CHECK: omp_loop.exit148: -// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM160]]) -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM162:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM162]]) -// CHECK-NEXT: br label [[OMP_LOOP_AFTER149:%.*]] -// CHECK: omp_loop.after149: +// CHECK-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR142]], ptr [[AGG_CAPTURED140]]) +// CHECK-NEXT: [[DOTCOUNT143:%.*]] = load i32, ptr [[DOTCOUNT_ADDR142]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER144:%.*]] +// CHECK: omp_loop.preheader144: +// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND158]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT143]], 1 +// CHECK-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND159]], align 4 +// CHECK-NEXT: store i32 1, ptr [[P_STRIDE160]], align 4 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM161:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM161]], i32 34, ptr [[P_LASTITER157]], ptr [[P_LOWERBOUND158]], ptr [[P_UPPERBOUND159]], ptr [[P_STRIDE160]], i32 1, i32 0) +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND158]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND159]], align 4 +// CHECK-NEXT: [[TRIP_COUNT_MINUS1162:%.*]] = sub i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1162]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER145:%.*]] +// CHECK: omp_loop.header145: +// CHECK-NEXT: [[OMP_LOOP_IV151:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER144]] ], [ [[OMP_LOOP_NEXT153:%.*]], [[OMP_LOOP_INC148:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND146:%.*]] +// CHECK: omp_loop.cond146: +// CHECK-NEXT: [[OMP_LOOP_CMP152:%.*]] = icmp ult i32 [[OMP_LOOP_IV151]], [[TMP22]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP152]], label [[OMP_LOOP_BODY147:%.*]], label [[OMP_LOOP_EXIT149:%.*]] +// CHECK: omp_loop.exit149: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM161]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM163:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM163]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER150:%.*]] +// CHECK: omp_loop.after150: // CHECK-NEXT: br label [[OMP_PAR_REGION9_PARALLEL_AFTER:%.*]] // CHECK: omp.par.region9.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE10:%.*]] // CHECK: omp.par.pre_finalize10: +// CHECK-NEXT: br label [[DOTFINI164:%.*]] +// CHECK: .fini164: // CHECK-NEXT: br label [[OMP_PAR_EXIT11_EXITSTUB:%.*]] -// CHECK: omp_loop.body146: -// CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV150]], [[TMP20]] -// CHECK-NEXT: call void @__captured_stmt.16(ptr [[I138]], i32 [[TMP23]], ptr [[AGG_CAPTURED140]]) +// CHECK: omp_loop.body147: +// CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV151]], [[TMP20]] +// CHECK-NEXT: call void @__captured_stmt.16(ptr [[I139]], i32 [[TMP23]], ptr [[AGG_CAPTURED141]]) // CHECK-NEXT: 
[[TMP24:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4 -// CHECK-NEXT: [[CONV153:%.*]] = sitofp i32 [[TMP24]] to double +// CHECK-NEXT: [[CONV154:%.*]] = sitofp i32 [[TMP24]] to double // CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8 -// CHECK-NEXT: [[ADD154:%.*]] = fadd double [[CONV153]], [[TMP25]] -// CHECK-NEXT: [[CONV155:%.*]] = fptrunc double [[ADD154]] to float +// CHECK-NEXT: [[ADD155:%.*]] = fadd double [[CONV154]], [[TMP25]] +// CHECK-NEXT: [[CONV156:%.*]] = fptrunc double [[ADD155]] to float // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8 -// CHECK-NEXT: store float [[CONV155]], ptr [[TMP26]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_INC147]] -// CHECK: omp_loop.inc147: -// CHECK-NEXT: [[OMP_LOOP_NEXT152]] = add nuw i32 [[OMP_LOOP_IV150]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER144]] +// CHECK-NEXT: store float [[CONV156]], ptr [[TMP26]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC148]] +// CHECK: omp_loop.inc148: +// CHECK-NEXT: [[OMP_LOOP_NEXT153]] = add nuw i32 [[OMP_LOOP_IV151]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER145]] // CHECK: omp_loop.body85: // CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OMP_LOOP_IV89]], [[TMP13]] // CHECK-NEXT: call void @__captured_stmt.12(ptr [[I77]], i32 [[TMP27]], ptr [[AGG_CAPTURED79]]) @@ -856,11 +866,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR103:%.*]], ptr noalias [[ZERO_ADDR104:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry105: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[P_LASTITER131:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND132:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND133:%.*]] = alloca i32, align 4 @@ -912,6 +922,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region106.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE107:%.*]] // CHECK: omp.par.pre_finalize107: +// CHECK-NEXT: br label [[DOTFINI138:%.*]] +// CHECK: .fini138: // CHECK-NEXT: br label [[OMP_PAR_EXIT108_EXITSTUB:%.*]] // CHECK: omp_loop.body121: // CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV125]], [[TMP6]] @@ -935,11 +947,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR42:%.*]], ptr noalias [[ZERO_ADDR43:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry44: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = 
getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[P_LASTITER70:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND71:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND72:%.*]] = alloca i32, align 4 @@ -991,6 +1003,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region45.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE46:%.*]] // CHECK: omp.par.pre_finalize46: +// CHECK-NEXT: br label [[DOTFINI:%.*]] +// CHECK: .fini: // CHECK-NEXT: br label [[OMP_PAR_EXIT47_EXITSTUB:%.*]] // CHECK: omp_loop.body60: // CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV64]], [[TMP6]] @@ -1022,7 +1036,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_3:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1045,7 +1059,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1065,7 +1079,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1082,7 +1096,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_5:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1105,7 +1119,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label 
[[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1125,7 +1139,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1142,7 +1156,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_7:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1165,7 +1179,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1185,7 +1199,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1202,7 +1216,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_9:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1225,7 +1239,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr 
[[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1245,7 +1259,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1262,7 +1276,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_11:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1285,7 +1299,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1305,7 +1319,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1322,7 +1336,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1345,7 +1359,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1365,7 +1379,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: 
[[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1382,7 +1396,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1405,7 +1419,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1425,7 +1439,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1442,7 +1456,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1465,7 +1479,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1485,7 +1499,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// 
CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1557,6 +1571,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp.par.region.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK-DEBUG: omp.par.pre_finalize: +// CHECK-DEBUG-NEXT: br label [[DOTFINI:%.*]] +// CHECK-DEBUG: .fini: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]], !dbg [[DBG30]] // CHECK-DEBUG: omp_loop.body: // CHECK-DEBUG-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP5]], !dbg [[DBG29]] @@ -1584,73 +1600,73 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META42:![0-9]+]], !DIExpression(), [[META44:![0-9]+]]) // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG45:![0-9]+]] -// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG45]] +// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG45]], !nonnull [[META12:![0-9]+]], !align [[META47:![0-9]+]] // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG45]] // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META44]] -// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META47:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) -// CHECK-DEBUG-NEXT: store i32 100, ptr [[DOTSTOP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTEP]], [[META49:![0-9]+]], !DIExpression(), [[META48]]) -// CHECK-DEBUG-NEXT: store i32 1, ptr [[DOTSTEP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]], !dbg [[META48]] -// CHECK-DEBUG-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[META48]] +// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META48:![0-9]+]], !DIExpression(), [[META49:![0-9]+]]) +// CHECK-DEBUG-NEXT: store i32 100, ptr [[DOTSTOP]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTEP]], [[META50:![0-9]+]], !DIExpression(), [[META49]]) +// CHECK-DEBUG-NEXT: store i32 1, ptr [[DOTSTEP]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]], !dbg [[META49]] +// CHECK-DEBUG-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[META49]] // CHECK-DEBUG: cond.true: -// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], [[TMP7]], !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[SUB1:%.*]] = sub i32 [[TMP8]], 1, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[SUB]], [[SUB1]], !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META48]] -// 
CHECK-DEBUG-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP9]], !dbg [[META48]]
-// CHECK-DEBUG-NEXT: br label [[COND_END:%.*]], !dbg [[META48]]
+// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], [[TMP7]], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[SUB1:%.*]] = sub i32 [[TMP8]], 1, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[SUB]], [[SUB1]], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP9]], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: br label [[COND_END:%.*]], !dbg [[META49]]
 // CHECK-DEBUG: cond.false:
-// CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META48]]
+// CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META49]]
 // CHECK-DEBUG: cond.end:
-// CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META48]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META48]]
-// CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META48]]
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG50:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META49]], !nonnull [[META12]], !align [[META47]]
+// CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG51:![0-9]+]]
 //
 //
 // CHECK-DEBUG-LABEL: define {{[^@]+}}@__captured_stmt.1
-// CHECK-DEBUG-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 noundef [[LOGICAL:%.*]], ptr noalias noundef [[__CONTEXT:%.*]]) #[[ATTR3]] !dbg [[DBG52:![0-9]+]] {
+// CHECK-DEBUG-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 noundef [[LOGICAL:%.*]], ptr noalias noundef [[__CONTEXT:%.*]]) #[[ATTR3]] !dbg [[DBG53:![0-9]+]] {
 // CHECK-DEBUG-NEXT: entry:
 // CHECK-DEBUG-NEXT: [[LOOPVAR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: [[LOGICAL_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: store ptr [[LOOPVAR]], ptr [[LOOPVAR_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOOPVAR_ADDR]], [[META60:![0-9]+]], !DIExpression(), [[META61:![0-9]+]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOOPVAR_ADDR]], [[META61:![0-9]+]], !DIExpression(), [[META62:![0-9]+]])
 // CHECK-DEBUG-NEXT: store i32 [[LOGICAL]], ptr [[LOGICAL_ADDR]], align 4
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOGICAL_ADDR]], [[META62:![0-9]+]], !DIExpression(), [[META61]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOGICAL_ADDR]], [[META63:![0-9]+]], !DIExpression(), [[META62]])
 // CHECK-DEBUG-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[__CONTEXT_ADDR]], [[META63:![0-9]+]], !DIExpression(), [[META61]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[__CONTEXT_ADDR]], [[META64:![0-9]+]], !DIExpression(), [[META62]])
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
-// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG64:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG64]]
-// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG66:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG66]]
-// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG66]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG66]]
-// CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META61]]
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG64]]
+// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG65:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG65]]
+// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG67:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG67]]
+// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG67]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG67]], !nonnull [[META12]], !align [[META47]]
+// CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META62]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG65]]
 //
 //
 // CHECK-DEBUG-LABEL: define {{[^@]+}}@_Z14parallel_for_1Pfid
-// CHECK-DEBUG-SAME: (ptr noundef [[R:%.*]], i32 noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] !dbg [[DBG69:![0-9]+]] {
+// CHECK-DEBUG-SAME: (ptr noundef [[R:%.*]], i32 noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] !dbg [[DBG70:![0-9]+]] {
 // CHECK-DEBUG-NEXT: entry:
 // CHECK-DEBUG-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8
 // CHECK-DEBUG-NEXT: [[R_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8
 // CHECK-DEBUG-NEXT: store ptr [[R]], ptr [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[R_ADDR]], [[META75:![0-9]+]], !DIExpression(), [[META76:![0-9]+]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[R_ADDR]], [[META76:![0-9]+]], !DIExpression(), [[META77:![0-9]+]])
 // CHECK-DEBUG-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META77:![0-9]+]], !DIExpression(), [[META78:![0-9]+]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META78:![0-9]+]], !DIExpression(), [[META79:![0-9]+]])
 // CHECK-DEBUG-NEXT: store double [[B]], ptr [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[B_ADDR]], [[META79:![0-9]+]], !DIExpression(), [[META80:![0-9]+]])
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG81:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[B_ADDR]], [[META80:![0-9]+]], !DIExpression(), [[META81:![0-9]+]])
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG82:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]]
 // CHECK-DEBUG: omp_parallel:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG]], i32 0, i32 0
@@ -1659,17 +1675,17 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: store ptr [[B_ADDR]], ptr [[GEP_B_ADDR]], align 8
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG]], i32 0, i32 2
 // CHECK-DEBUG-NEXT: store ptr [[R_ADDR]], ptr [[GEP_R_ADDR]], align 8
-// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB6]], i32 1, ptr @_Z14parallel_for_1Pfid..omp_par.4, ptr [[STRUCTARG]]), !dbg [[DBG82:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB6]], i32 1, ptr @_Z14parallel_for_1Pfid..omp_par.4, ptr [[STRUCTARG]]), !dbg [[DBG83:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT:%.*]]
 // CHECK-DEBUG: omp.par.exit:
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG84:![0-9]+]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG85:![0-9]+]]
 //
 //
 // CHECK-DEBUG-LABEL: define {{[^@]+}}@_Z14parallel_for_1Pfid..omp_par.4
-// CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG85:![0-9]+]] {
+// CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG86:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -1700,6 +1716,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI16:%.*]]
+// CHECK-DEBUG: .fini16:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]], !dbg [[DBG100]]
 // CHECK-DEBUG: omp.par.exit.exitStub:
 // CHECK-DEBUG-NEXT: ret void
@@ -1709,7 +1727,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR2:%.*]], ptr noalias [[ZERO_ADDR3:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG101:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry4:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -1769,6 +1787,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region5.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE6:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize6:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI:%.*]]
+// CHECK-DEBUG: .fini:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT7_EXITSTUB:%.*]], !dbg [[DBG117]]
 // CHECK-DEBUG: omp_loop.body:
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]], !dbg [[DBG116]]
@@ -1803,7 +1823,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META128:![0-9]+]], !DIExpression(), [[META130:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_1:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG131:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG131]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG131]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG131]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META130]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META133:![0-9]+]], !DIExpression(), [[META134:![0-9]+]])
@@ -1828,7 +1848,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META134]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META134]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META134]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META134]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META134]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG136:![0-9]+]]
 //
@@ -1851,7 +1871,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG145:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG145]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG145]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG145]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META140]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG143]]
 //
@@ -1863,14 +1883,14 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[R_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8
-// CHECK-DEBUG-NEXT: [[I188:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED189:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED190:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4
-// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR191:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LASTITER206:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LOWERBOUND207:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_UPPERBOUND208:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_STRIDE209:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[I191:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED192:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED193:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4
+// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR194:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LASTITER209:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LOWERBOUND210:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_UPPERBOUND211:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_STRIDE212:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: store ptr [[R]], ptr [[R_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[R_ADDR]], [[META147:![0-9]+]], !DIExpression(), [[META148:![0-9]+]])
 // CHECK-DEBUG-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
@@ -1889,54 +1909,54 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB13]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.23, ptr [[STRUCTARG]]), !dbg [[DBG154:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT:%.*]]
 // CHECK-DEBUG: omp.par.exit:
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I188]], [[META158:![0-9]+]], !DIExpression(), [[META161:![0-9]+]])
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[I188]], align 4, !dbg [[META161]]
-// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED189]], i32 0, i32 0, !dbg [[DBG162:![0-9]+]]
-// CHECK-DEBUG-NEXT: store ptr [[I188]], ptr [[TMP0]], align 8, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED190]], i32 0, i32 0, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[I188]], align 4, !dbg [[DBG163:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I191]], [[META158:![0-9]+]], !DIExpression(), [[META161:![0-9]+]])
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[I191]], align 4, !dbg [[META161]]
+// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED192]], i32 0, i32 0, !dbg [[DBG162:![0-9]+]]
+// CHECK-DEBUG-NEXT: store ptr [[I191]], ptr [[TMP0]], align 8, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED193]], i32 0, i32 0, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[I191]], align 4, !dbg [[DBG163:![0-9]+]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR191]], ptr [[AGG_CAPTURED189]]), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[DOTCOUNT192:%.*]] = load i32, ptr [[DOTCOUNT_ADDR191]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER193:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.preheader193:
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND207]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT192]], 1, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND208]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE209]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM210:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42:[0-9]+]]), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM210]], i32 34, ptr [[P_LASTITER206]], ptr [[P_LOWERBOUND207]], ptr [[P_UPPERBOUND208]], ptr [[P_STRIDE209]], i32 1, i32 0), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND207]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND208]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1211:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1211]], 1, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER194:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.header194:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV200:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER193]] ], [ [[OMP_LOOP_NEXT202:%.*]], [[OMP_LOOP_INC197:%.*]] ], !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND195:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.cond195:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP201:%.*]] = icmp ult i32 [[OMP_LOOP_IV200]], [[TMP6]], !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP201]], label [[OMP_LOOP_BODY196:%.*]], label [[OMP_LOOP_EXIT198:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.body196:
-// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV200]], [[TMP4]], !dbg [[DBG164:![0-9]+]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.20(ptr [[I188]], i32 [[TMP7]], ptr [[AGG_CAPTURED190]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR194]], ptr [[AGG_CAPTURED192]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[DOTCOUNT195:%.*]] = load i32, ptr [[DOTCOUNT_ADDR194]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER196:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.preheader196:
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND210]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT195]], 1, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND211]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE212]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM213:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42:[0-9]+]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM213]], i32 34, ptr [[P_LASTITER209]], ptr [[P_LOWERBOUND210]], ptr [[P_UPPERBOUND211]], ptr [[P_STRIDE212]], i32 1, i32 0), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND210]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND211]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1214:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1214]], 1, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER197:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.header197:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV203:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER196]] ], [ [[OMP_LOOP_NEXT205:%.*]], [[OMP_LOOP_INC200:%.*]] ], !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND198:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.cond198:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP204:%.*]] = icmp ult i32 [[OMP_LOOP_IV203]], [[TMP6]], !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP204]], label [[OMP_LOOP_BODY199:%.*]], label [[OMP_LOOP_EXIT201:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.body199:
+// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV203]], [[TMP4]], !dbg [[DBG164:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.20(ptr [[I191]], i32 [[TMP7]], ptr [[AGG_CAPTURED193]]), !dbg [[DBG162]]
 // CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG165:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV203:%.*]] = sitofp i32 [[TMP8]] to double, !dbg [[DBG165]]
+// CHECK-DEBUG-NEXT: [[CONV206:%.*]] = sitofp i32 [[TMP8]] to double, !dbg [[DBG165]]
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load double, ptr [[B_ADDR]], align 8, !dbg [[DBG164]]
-// CHECK-DEBUG-NEXT: [[ADD204:%.*]] = fadd double [[CONV203]], [[TMP9]], !dbg [[DBG166:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV205:%.*]] = fptrunc double [[ADD204]] to float, !dbg [[DBG165]]
+// CHECK-DEBUG-NEXT: [[ADD207:%.*]] = fadd double [[CONV206]], [[TMP9]], !dbg [[DBG166:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV208:%.*]] = fptrunc double [[ADD207]] to float, !dbg [[DBG165]]
 // CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[R_ADDR]], align 8, !dbg [[DBG167:![0-9]+]]
-// CHECK-DEBUG-NEXT: store float [[CONV205]], ptr [[TMP10]], align 4, !dbg [[DBG168:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC197]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.inc197:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT202]] = add nuw i32 [[OMP_LOOP_IV200]], 1, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER194]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.exit198:
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM210]]), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM212:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42]]), !dbg [[DBG164]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM212]]), !dbg [[DBG164]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER199:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.after199:
+// CHECK-DEBUG-NEXT: store float [[CONV208]], ptr [[TMP10]], align 4, !dbg [[DBG168:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC200]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.inc200:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT205]] = add nuw i32 [[OMP_LOOP_IV203]], 1, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER197]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.exit201:
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM213]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM215:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42]]), !dbg [[DBG164]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM215]]), !dbg [[DBG164]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER202:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.after202:
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG169:![0-9]+]]
 //
 //
@@ -1944,16 +1964,16 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG170:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
 // CHECK-DEBUG-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8
-// CHECK-DEBUG-NEXT: [[P_LASTITER181:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LOWERBOUND182:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_UPPERBOUND183:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_STRIDE184:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LASTITER183:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LOWERBOUND184:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_UPPERBOUND185:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_STRIDE186:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4
@@ -1966,10 +1986,10 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_3:%.*]], align 8
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_4:%.*]], align 4
 // CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[I163:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED164:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED165:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4
-// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR166:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[I165:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED166:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED167:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4
+// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR168:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_A_ADDR]], [[META171:![0-9]+]], !DIExpression(), [[META172:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_B_ADDR]], [[META173:![0-9]+]], !DIExpression(), [[META174:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_R_ADDR]], [[META175:![0-9]+]], !DIExpression(), [[META176:![0-9]+]])
@@ -2021,59 +2041,61 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB18]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.22, ptr [[STRUCTARG]]), !dbg [[DBG186:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT11:%.*]]
 // CHECK-DEBUG: omp.par.exit11:
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I163]], [[META190:![0-9]+]], !DIExpression(), [[META193:![0-9]+]])
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[I163]], align 4, !dbg [[META193]]
-// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED164]], i32 0, i32 0, !dbg [[DBG194:![0-9]+]]
-// CHECK-DEBUG-NEXT: store ptr [[I163]], ptr [[TMP9]], align 8, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED165]], i32 0, i32 0, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load i32, ptr [[I163]], align 4, !dbg [[DBG195:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I165]], [[META190:![0-9]+]], !DIExpression(), [[META193:![0-9]+]])
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[I165]], align 4, !dbg [[META193]]
+// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED166]], i32 0, i32 0, !dbg [[DBG194:![0-9]+]]
+// CHECK-DEBUG-NEXT: store ptr [[I165]], ptr [[TMP9]], align 8, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED167]], i32 0, i32 0, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load i32, ptr [[I165]], align 4, !dbg [[DBG195:![0-9]+]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR166]], ptr [[AGG_CAPTURED164]]), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[DOTCOUNT167:%.*]] = load i32, ptr [[DOTCOUNT_ADDR166]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER168:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.preheader168:
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND182]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT167]], 1, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND183]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE184]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM185:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39:[0-9]+]]), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM185]], i32 34, ptr [[P_LASTITER181]], ptr [[P_LOWERBOUND182]], ptr [[P_UPPERBOUND183]], ptr [[P_STRIDE184]], i32 1, i32 0), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND182]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND183]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1186:%.*]] = sub i32 [[TMP14]], [[TMP13]], !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1186]], 1, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER169:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.header169:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV175:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER168]] ], [ [[OMP_LOOP_NEXT177:%.*]], [[OMP_LOOP_INC172:%.*]] ], !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND170:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.cond170:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP176:%.*]] = icmp ult i32 [[OMP_LOOP_IV175]], [[TMP15]], !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP176]], label [[OMP_LOOP_BODY171:%.*]], label [[OMP_LOOP_EXIT173:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.exit173:
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM185]]), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM187:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39]]), !dbg [[DBG196:![0-9]+]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB40:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM187]]), !dbg [[DBG196]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER174:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.after174:
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR168]], ptr [[AGG_CAPTURED166]]), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[DOTCOUNT169:%.*]] = load i32, ptr [[DOTCOUNT_ADDR168]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER170:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.preheader170:
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND184]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT169]], 1, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND185]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE186]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM187:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39:[0-9]+]]), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM187]], i32 34, ptr [[P_LASTITER183]], ptr [[P_LOWERBOUND184]], ptr [[P_UPPERBOUND185]], ptr [[P_STRIDE186]], i32 1, i32 0), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND184]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND185]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1188:%.*]] = sub i32 [[TMP14]], [[TMP13]], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1188]], 1, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER171:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.header171:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV177:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER170]] ], [ [[OMP_LOOP_NEXT179:%.*]], [[OMP_LOOP_INC174:%.*]] ], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND172:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.cond172:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP178:%.*]] = icmp ult i32 [[OMP_LOOP_IV177]], [[TMP15]], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP178]], label [[OMP_LOOP_BODY173:%.*]], label [[OMP_LOOP_EXIT175:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.exit175:
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM187]]), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM189:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39]]), !dbg [[DBG196:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB40:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM189]]), !dbg [[DBG196]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER176:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.after176:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]], !dbg [[DBG197:![0-9]+]]
 // CHECK-DEBUG: omp.par.region.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI190:%.*]]
+// CHECK-DEBUG: .fini190:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]], !dbg [[DBG197]]
-// CHECK-DEBUG: omp_loop.body171:
-// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV175]], [[TMP13]], !dbg [[DBG196]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.18(ptr [[I163]], i32 [[TMP16]], ptr [[AGG_CAPTURED165]]), !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.body173:
+// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV177]], [[TMP13]], !dbg [[DBG196]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.18(ptr [[I165]], i32 [[TMP16]], ptr [[AGG_CAPTURED167]]), !dbg [[DBG194]]
 // CHECK-DEBUG-NEXT: [[TMP17:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4, !dbg [[DBG198:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV178:%.*]] = sitofp i32 [[TMP17]] to double, !dbg [[DBG198]]
+// CHECK-DEBUG-NEXT: [[CONV180:%.*]] = sitofp i32 [[TMP17]] to double, !dbg [[DBG198]]
 // CHECK-DEBUG-NEXT: [[TMP18:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8, !dbg [[DBG196]]
-// CHECK-DEBUG-NEXT: [[ADD179:%.*]] = fadd double [[CONV178]], [[TMP18]], !dbg [[DBG199:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV180:%.*]] = fptrunc double [[ADD179]] to float, !dbg [[DBG198]]
+// CHECK-DEBUG-NEXT: [[ADD181:%.*]] = fadd double [[CONV180]], [[TMP18]], !dbg [[DBG199:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV182:%.*]] = fptrunc double [[ADD181]] to float, !dbg [[DBG198]]
 // CHECK-DEBUG-NEXT: [[TMP19:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8, !dbg [[DBG200:![0-9]+]]
-// CHECK-DEBUG-NEXT: store float [[CONV180]], ptr [[TMP19]], align 4, !dbg [[DBG201:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC172]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.inc172:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT177]] = add nuw i32 [[OMP_LOOP_IV175]], 1, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER169]], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: store float [[CONV182]], ptr [[TMP19]], align 4, !dbg [[DBG201:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC174]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.inc174:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT179]] = add nuw i32 [[OMP_LOOP_IV177]], 1, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER171]], !dbg [[DBG194]]
 // CHECK-DEBUG: omp_loop.body:
 // CHECK-DEBUG-NEXT: [[TMP20:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]], !dbg [[DBG184]]
 // CHECK-DEBUG-NEXT: call void @__captured_stmt.6(ptr [[I]], i32 [[TMP20]], ptr [[AGG_CAPTURED1]]), !dbg [[DBG182]]
@@ -2096,17 +2118,17 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR6:%.*]], ptr noalias [[ZERO_ADDR7:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG206:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry8:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
 // CHECK-DEBUG-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META87]]
-// CHECK-DEBUG-NEXT: [[STRUCTARG213:%.*]] = alloca { ptr, ptr, ptr }, align 8
+// CHECK-DEBUG-NEXT: [[STRUCTARG216:%.*]] = alloca { ptr, ptr, ptr }, align 8
 // CHECK-DEBUG-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8
-// CHECK-DEBUG-NEXT: [[P_LASTITER156:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LOWERBOUND157:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_UPPERBOUND158:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_STRIDE159:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LASTITER157:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LOWERBOUND158:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_UPPERBOUND159:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_STRIDE160:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LASTITER95:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LOWERBOUND96:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_UPPERBOUND97:%.*]] = alloca i32, align 4
@@ -2127,10 +2149,10 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED78:%.*]] = alloca [[STRUCT_ANON_9:%.*]], align 8
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED79:%.*]] = alloca [[STRUCT_ANON_10:%.*]], align 4
 // CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR80:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[I138:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED139:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4
-// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR141:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[I139:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED141:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4
+// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR142:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_A_ADDR]], [[META207:![0-9]+]], !DIExpression(), [[META208:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_B_ADDR]], [[META209:![0-9]+]], !DIExpression(), [[META210:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_R_ADDR]], [[META211:![0-9]+]], !DIExpression(), [[META212:![0-9]+]])
@@ -2217,70 +2239,72 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER88:%.*]], !dbg [[DBG231]]
 // CHECK-DEBUG: omp_loop.after88:
 // CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM102:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB31:[0-9]+]]), !dbg [[DBG234:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL217:%.*]]
-// CHECK-DEBUG: omp_parallel217:
-// CHECK-DEBUG-NEXT: [[GEP_A_ADDR214:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR214]], align 8
-// CHECK-DEBUG-NEXT: [[GEP_B_ADDR215:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 1
-// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR215]], align 8
-// CHECK-DEBUG-NEXT: [[GEP_R_ADDR216:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 2
-// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR216]], align 8
-// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB31]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG213]]), !dbg [[DBG235:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL220:%.*]]
+// CHECK-DEBUG: omp_parallel220:
+// CHECK-DEBUG-NEXT: [[GEP_A_ADDR217:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 0
+// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR217]], align 8
+// CHECK-DEBUG-NEXT: [[GEP_B_ADDR218:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 1
+// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR218]], align 8
+// CHECK-DEBUG-NEXT: [[GEP_R_ADDR219:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 2
+// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR219]], align 8
+// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB31]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG216]]), !dbg [[DBG235:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT108:%.*]]
 // CHECK-DEBUG: omp.par.exit108:
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I138]], [[META239:![0-9]+]], !DIExpression(), [[META242:![0-9]+]])
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[I138]], align 4, !dbg [[META242]]
-// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED139]], i32 0, i32 0, !dbg [[DBG243:![0-9]+]]
-// CHECK-DEBUG-NEXT: store ptr [[I138]], ptr [[TMP16]], align 8, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED140]], i32 0, i32 0, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP18:%.*]] = load i32, ptr [[I138]], align 4, !dbg [[DBG244:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I139]], [[META239:![0-9]+]], !DIExpression(), [[META242:![0-9]+]])
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[I139]], align 4, !dbg [[META242]]
+// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED140]], i32 0, i32 0, !dbg [[DBG243:![0-9]+]]
+// CHECK-DEBUG-NEXT: store ptr [[I139]], ptr [[TMP16]], align 8, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED141]], i32 0, i32 0, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP18:%.*]] = load i32, ptr [[I139]], align 4, !dbg [[DBG244:![0-9]+]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR141]], ptr [[AGG_CAPTURED139]]), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[DOTCOUNT142:%.*]] = load i32, ptr [[DOTCOUNT_ADDR141]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER143:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.preheader143:
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND157]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT142]], 1, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND158]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE159]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM160:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36:[0-9]+]]), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM160]], i32 34, ptr [[P_LASTITER156]], ptr [[P_LOWERBOUND157]], ptr [[P_UPPERBOUND158]], ptr [[P_STRIDE159]], i32 1, i32 0), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND157]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND158]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1161:%.*]] = sub i32 [[TMP21]], [[TMP20]], !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1161]], 1, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER144:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.header144:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV150:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER143]] ], [ [[OMP_LOOP_NEXT152:%.*]], [[OMP_LOOP_INC147:%.*]] ], !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND145:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.cond145:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP151:%.*]] = icmp ult i32 [[OMP_LOOP_IV150]], [[TMP22]], !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP151]], label [[OMP_LOOP_BODY146:%.*]], label [[OMP_LOOP_EXIT148:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.exit148:
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM160]]), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM162:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36]]), !dbg [[DBG245:![0-9]+]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB37:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM162]]), !dbg [[DBG245]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER149:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.after149:
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR142]], ptr [[AGG_CAPTURED140]]), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[DOTCOUNT143:%.*]] = load i32, ptr [[DOTCOUNT_ADDR142]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER144:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.preheader144:
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND158]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT143]], 1, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND159]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE160]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM161:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36:[0-9]+]]), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM161]], i32 34, ptr [[P_LASTITER157]], ptr [[P_LOWERBOUND158]], ptr [[P_UPPERBOUND159]], ptr [[P_STRIDE160]], i32 1, i32 0), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND158]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND159]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1162:%.*]] = sub i32 [[TMP21]], [[TMP20]], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1162]], 1, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER145:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.header145:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV151:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER144]] ], [ [[OMP_LOOP_NEXT153:%.*]], [[OMP_LOOP_INC148:%.*]] ], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND146:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.cond146:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP152:%.*]] = icmp ult i32 [[OMP_LOOP_IV151]], [[TMP22]], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP152]], label [[OMP_LOOP_BODY147:%.*]], label [[OMP_LOOP_EXIT149:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.exit149:
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM161]]), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM163:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36]]), !dbg [[DBG245:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB37:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM163]]), !dbg [[DBG245]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER150:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.after150:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_REGION9_PARALLEL_AFTER:%.*]], !dbg [[DBG246:![0-9]+]]
 // CHECK-DEBUG: omp.par.region9.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE10:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize10:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI164:%.*]]
+// CHECK-DEBUG: .fini164:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT11_EXITSTUB:%.*]], !dbg [[DBG246]]
-// CHECK-DEBUG: omp_loop.body146:
-// CHECK-DEBUG-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV150]], [[TMP20]], !dbg [[DBG245]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.16(ptr [[I138]], i32 [[TMP23]], ptr [[AGG_CAPTURED140]]), !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.body147:
+// CHECK-DEBUG-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV151]], [[TMP20]], !dbg [[DBG245]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.16(ptr [[I139]], i32 [[TMP23]], ptr [[AGG_CAPTURED141]]), !dbg [[DBG243]]
 // CHECK-DEBUG-NEXT: [[TMP24:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4, !dbg [[DBG247:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV153:%.*]] = sitofp i32 [[TMP24]] to double, !dbg [[DBG247]]
+// CHECK-DEBUG-NEXT: [[CONV154:%.*]] = sitofp i32 [[TMP24]] to double, !dbg [[DBG247]]
 // CHECK-DEBUG-NEXT: [[TMP25:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8, !dbg [[DBG245]]
-// CHECK-DEBUG-NEXT: [[ADD154:%.*]] = fadd double [[CONV153]], [[TMP25]], !dbg [[DBG248:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV155:%.*]] = fptrunc double [[ADD154]] to float, !dbg [[DBG247]]
+// CHECK-DEBUG-NEXT: [[ADD155:%.*]] = fadd double [[CONV154]], [[TMP25]], !dbg [[DBG248:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV156:%.*]] = fptrunc double [[ADD155]] to float, !dbg [[DBG247]]
 // CHECK-DEBUG-NEXT: [[TMP26:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8, !dbg [[DBG249:![0-9]+]]
-// CHECK-DEBUG-NEXT: store float [[CONV155]], ptr [[TMP26]], align 4, !dbg [[DBG250:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC147]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.inc147:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT152]] = add nuw i32 [[OMP_LOOP_IV150]], 1, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER144]], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: store float [[CONV156]], ptr [[TMP26]], align 4, !dbg [[DBG250:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC148]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.inc148:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT153]] = add nuw i32 [[OMP_LOOP_IV151]], 1, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER145]], !dbg [[DBG243]]
 // CHECK-DEBUG: omp_loop.body85:
 // CHECK-DEBUG-NEXT: [[TMP27:%.*]] = add i32 [[OMP_LOOP_IV89]], [[TMP13]], !dbg [[DBG233]]
 // CHECK-DEBUG-NEXT: call void @__captured_stmt.12(ptr [[I77]], i32 [[TMP27]], ptr [[AGG_CAPTURED79]]), !dbg [[DBG231]]
@@ -2317,7 +2341,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR103:%.*]], ptr noalias [[ZERO_ADDR104:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG259:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry105:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -2377,6 +2401,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region106.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE107:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize107:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI138:%.*]]
+// CHECK-DEBUG: .fini138:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT108_EXITSTUB:%.*]], !dbg [[DBG276]]
 // CHECK-DEBUG: omp_loop.body121:
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV125]], [[TMP6]], !dbg [[DBG275]]
@@ -2400,7 +2426,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR42:%.*]], ptr noalias [[ZERO_ADDR43:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG281:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry44:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -2460,6 +2486,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region45.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE46:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize46:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI:%.*]]
+// CHECK-DEBUG: .fini:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT47_EXITSTUB:%.*]], !dbg [[DBG298]]
 // CHECK-DEBUG: omp_loop.body60:
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV64]], [[TMP6]], !dbg [[DBG297]]
@@ -2494,7 +2522,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META307:![0-9]+]], !DIExpression(), [[META309:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_3:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG310:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG310]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG310]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG310]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META309]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META312:![0-9]+]], !DIExpression(), [[META313:![0-9]+]])
@@ -2519,7 +2547,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META313]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META313]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META313]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META313]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META313]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG315:![0-9]+]]
 //
@@ -2542,7 +2570,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG324:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG324]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG324]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG324]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG324]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META319]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG322]]
 //
@@ -2562,7 +2590,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META329:![0-9]+]], !DIExpression(), [[META331:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_5:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG332:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG332]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG332]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG332]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META331]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META334:![0-9]+]], !DIExpression(), [[META335:![0-9]+]])
@@ -2587,7 +2615,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META335]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META335]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META335]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META335]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META335]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG337:![0-9]+]]
 //
@@ -2610,7 +2638,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG346:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG346]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG346]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG346]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG346]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META341]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG344]]
 //
@@ -2630,7 +2658,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META351:![0-9]+]], !DIExpression(), [[META353:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_7:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG354:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG354]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG354]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG354]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META353]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META356:![0-9]+]], !DIExpression(), [[META357:![0-9]+]])
@@ -2655,7 +2683,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META357]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META357]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META357]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META357]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META357]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG359:![0-9]+]]
 //
@@ -2678,7 +2706,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG368:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG368]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG368]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG368]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG368]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META363]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG366]]
 //
@@ -2698,7 +2726,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META373:![0-9]+]], !DIExpression(), [[META375:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_9:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG376:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG376]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG376]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG376]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META375]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META378:![0-9]+]], !DIExpression(), [[META379:![0-9]+]])
@@ -2723,7 +2751,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META379]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META379]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META379]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META379]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META379]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG381:![0-9]+]]
 //
@@ -2746,7 +2774,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG390:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG390]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG390]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG390]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG390]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META385]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG388]]
 //
@@ -2766,7 +2794,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META395:![0-9]+]], !DIExpression(), [[META397:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_11:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG398:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG398]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG398]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG398]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META397]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META400:![0-9]+]], !DIExpression(), [[META401:![0-9]+]])
@@ -2791,7 +2819,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META401]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META401]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META401]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META401]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META401]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG403:![0-9]+]]
 //
@@ -2814,7 +2842,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG412:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG412]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG412]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG412]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG412]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META407]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG410]]
 //
@@ -2834,7 +2862,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META417:![0-9]+]], !DIExpression(), [[META419:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG420:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG420]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG420]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG420]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META419]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META422:![0-9]+]], !DIExpression(), [[META423:![0-9]+]])
@@ -2859,7 +2887,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META423]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META423]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META423]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META423]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META423]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG425:![0-9]+]]
 //
@@ -2882,7 +2910,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG434:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG434]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG434]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG434]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG434]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META429]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG432]]
 //
@@ -2902,7 +2930,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META439:![0-9]+]], !DIExpression(), [[META441:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG442:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG442]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG442]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG442]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META441]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META444:![0-9]+]], !DIExpression(), [[META445:![0-9]+]])
@@ -2927,7 +2955,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META445]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META445]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META445]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META445]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META445]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG447:![0-9]+]]
 //
@@ -2950,7 +2978,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG456:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG456]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG456]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG456]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG456]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META451]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG454]]
 //
@@ -2970,7 +2998,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META461:![0-9]+]], !DIExpression(), [[META463:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG464:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG464]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG464]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT:
[[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG464]] // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META463]] // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META466:![0-9]+]], !DIExpression(), [[META467:![0-9]+]]) @@ -2995,7 +3023,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META467]] // CHECK-DEBUG: cond.end: // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META467]] -// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META467]] +// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META467]], !nonnull [[META12]], !align [[META47]] // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META467]] // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG469:![0-9]+]] // @@ -3018,7 +3046,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG478:![0-9]+]] // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG478]] // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG478]] -// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG478]] +// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG478]], !nonnull [[META12]], !align [[META47]] // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META473]] // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG476]] // diff --git a/clang/test/OpenMP/masked_codegen.cpp b/clang/test/OpenMP/masked_codegen.cpp index a39de12d69337..bc6f68de9b248 100644 --- a/clang/test/OpenMP/masked_codegen.cpp +++ b/clang/test/OpenMP/masked_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_masked(ptr [[DEFAULT_LOC]], i32 [[GTID]]) // ALL-NEXT: br label {{%?}}[[EXIT]] // ALL: [[EXIT]] diff --git a/clang/test/OpenMP/master_codegen.cpp b/clang/test/OpenMP/master_codegen.cpp index a7af326caacfe..5a92444d9a927 100644 --- a/clang/test/OpenMP/master_codegen.cpp +++ b/clang/test/OpenMP/master_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_master(ptr [[DEFAULT_LOC]], i32 [[GTID]]) // ALL-NEXT: br label {{%?}}[[EXIT]] // ALL: [[EXIT]] diff --git a/clang/test/OpenMP/nested_loop_codegen.cpp b/clang/test/OpenMP/nested_loop_codegen.cpp index 9aefc6a739e51..e01fd0da31ee8 100644 --- a/clang/test/OpenMP/nested_loop_codegen.cpp +++ b/clang/test/OpenMP/nested_loop_codegen.cpp @@ -904,6 +904,8 @@ int inline_decl() { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG27]] // CHECK4: for.body: // CHECK4-NEXT: store i32 0, ptr [[LOADGEP_K]], align 4, !dbg [[DBG28:![0-9]+]] @@ -1083,6 +1085,8 @@ int inline_decl() { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label 
[[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG90]] // CHECK4: for.body: // CHECK4-NEXT: #dbg_declare(ptr [[K]], [[META91:![0-9]+]], !DIExpression(), [[META95:![0-9]+]]) diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp index 5cd95f1927e5c..3b29feac7caa2 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -794,6 +794,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -884,6 +886,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1022,6 +1026,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL29]], ptr [[ARRAYIDX31]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1131,6 +1137,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1296,17 +1304,19 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body33.ordered.after: -// CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] -// CHECK1-IRBUILDER: omp.body.continue38: -// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] -// CHECK1-IRBUILDER: omp.inner.for.inc39: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE38:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize38: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE39:%.*]] +// CHECK1-IRBUILDER: omp.body.continue39: +// CHECK1-IRBUILDER-NEXT: br 
label [[OMP_INNER_FOR_INC40:%.*]] +// CHECK1-IRBUILDER: omp.inner.for.inc40: // CHECK1-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 // CHECK1-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) // CHECK1-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] -// CHECK1-IRBUILDER: omp.inner.for.end42: +// CHECK1-IRBUILDER: omp.inner.for.end43: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1-IRBUILDER: omp.dispatch.inc: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_COND]] @@ -2034,6 +2044,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2124,6 +2136,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2262,6 +2276,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL29]], ptr [[ARRAYIDX31]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2371,6 +2387,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2536,17 +2554,19 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body33.ordered.after: -// CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] -// CHECK3-IRBUILDER: omp.body.continue38: -// CHECK3-IRBUILDER-NEXT: br label 
[[OMP_INNER_FOR_INC39:%.*]] -// CHECK3-IRBUILDER: omp.inner.for.inc39: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE38:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize38: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE39:%.*]] +// CHECK3-IRBUILDER: omp.body.continue39: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC40:%.*]] +// CHECK3-IRBUILDER: omp.inner.for.inc40: // CHECK3-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 // CHECK3-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) // CHECK3-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] -// CHECK3-IRBUILDER: omp.inner.for.end42: +// CHECK3-IRBUILDER: omp.inner.for.end43: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK3-IRBUILDER: omp.dispatch.inc: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_COND]] diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp index e8e57aedaa164..9f6004e37db9c 100644 --- a/clang/test/OpenMP/parallel_codegen.cpp +++ b/clang/test/OpenMP/parallel_codegen.cpp @@ -906,6 +906,8 @@ int main (int argc, char **argv) { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG35]] // CHECK4: omp.par.exit.exitStub: // CHECK4-NEXT: ret void @@ -975,6 +977,8 @@ int main (int argc, char **argv) { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG66]] // CHECK4: omp.par.exit.exitStub: // CHECK4-NEXT: ret void diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp index c8211f475c7fc..6c754c1c953ea 100644 --- a/clang/test/OpenMP/target_update_codegen.cpp +++ b/clang/test/OpenMP/target_update_codegen.cpp @@ -1560,5 +1560,37 @@ void foo(int arg) { { ++arg; } } +#endif +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -DCK26 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -DCK26 -fopenmp-version=51 -verify -Wno-vla -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-32 +// RUN: %clang_cc1 -DCK26 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp 
-fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-32 + +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// SIMD-ONLY19-NOT: {{__kmpc|__tgt}} +#ifdef CK26 +void foo() { +int a[10]; +#pragma omp target update to(iterator(int it = 0:10) : a[it]) +// CK26-LABEL: define {{.+}}foo +// CK26: %[[ITER:[a-zA-Z0-9_]+]] = alloca i32, align 4 +// CK26: %[[LOAD2:.*]] = load i32, ptr %[[ITER]], align 4 +} + +void foo1() { +int a[10]; +#pragma omp target update from(iterator(int it = 0:10) : a[it]) +// CK26-LABEL: define {{.+}}foo1 +// CK26: %[[ITER:[a-zA-Z0-9_]+]] = alloca i32, align 4 +// CK26: %[[LOAD2:.*]] = load i32, ptr %[[ITER]], align 4 +} + #endif #endif diff --git a/clang/test/OpenMP/target_update_iterator_ast_print.cpp b/clang/test/OpenMP/target_update_iterator_ast_print.cpp new file mode 100644 index 0000000000000..322f565c9c732 --- /dev/null +++ b/clang/test/OpenMP/target_update_iterator_ast_print.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void test() { + int a[10]; + #pragma omp target update to(iterator(int it = 0:10): a[it]) + // CHECK: int a[10]; + // CHECK: #pragma omp target update to(iterator(int it = 0:10): a[it]) + #pragma omp target update from(iterator(int it = 0:10): a[it]) + // CHECK: #pragma omp target update from(iterator(int it = 0:10): a[it]) +} + +#endif diff --git a/clang/test/OpenMP/target_update_iterator_serialization.cpp b/clang/test/OpenMP/target_update_iterator_serialization.cpp new file mode 100644 index 0000000000000..c1ad380f7c9a5 --- /dev/null +++ b/clang/test/OpenMP/target_update_iterator_serialization.cpp @@ -0,0 +1,35 @@ +// Test without serialization: +// RUN: %clang_cc1 -std=c++20 -fopenmp %s -ast-dump | FileCheck %s + +// Test with serialization: +// RUN: %clang_cc1 -std=c++20 -fopenmp -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++20 -fopenmp -include-pch %t -ast-dump-all /dev/null \ +// RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \ +// RUN: | FileCheck %s + +// CHECK: OMPTargetUpdateDirective +// CHECK-NEXT: OMPFromClause +// CHECK-NEXT: ArraySubscriptExpr +// CHECK: DeclRefExpr {{.*}} 'a' +// 
CHECK: DeclRefExpr {{.*}} 'it' + + +void foo1() { + int a[10]; + +#pragma omp target update from(iterator(int it = 0:10) : a[it]) + ; +} + +// CHECK: OMPTargetUpdateDirective +// CHECK-NEXT: OMPToClause +// CHECK-NEXT: ArraySubscriptExpr +// CHECK: DeclRefExpr {{.*}} 'a' +// CHECK: DeclRefExpr {{.*}} 'it' + +void foo2() { + int a[10]; + +#pragma omp target update to(iterator(int it = 0:10) : a[it]) + ; +} diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 index cf77c46346b7f..fd59d39b552da 100644 --- a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 +++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 @@ -174,10 +174,13 @@ subroutine worst_case(a, b, c, d) ! CHECK-NEXT: br label %omp.par.pre_finalize ! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize +! CHECK-NEXT: br label %.fini + +! CHECK: .fini: ! CHECK-NEXT: %{{.*}} = load ptr, ptr ! CHECK-NEXT: br label %omp.reduction.cleanup -! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize +! CHECK: omp.reduction.cleanup: ; preds = %.fini ! [null check] ! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup43, label %omp.reduction.cleanup44 diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp index 0cd48d91a682a..4d5afcf93a44b 100644 --- a/lldb/tools/debugserver/source/DNB.cpp +++ b/lldb/tools/debugserver/source/DNB.cpp @@ -1101,7 +1101,7 @@ DNBGetLibrariesInfoForAddresses(nub_process_t pid, JSONGenerator::ObjectSP DNBGetSharedCacheInfo(nub_process_t pid) { MachProcessSP procSP; if (GetProcessSP(pid, procSP)) { - return procSP->GetSharedCacheInfo(pid); + return procSP->GetInferiorSharedCacheInfo(pid); } return JSONGenerator::ObjectSP(); } diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.h b/lldb/tools/debugserver/source/MacOSX/MachProcess.h index 56bc9d6c7461e..67b27b9902999 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.h +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.h @@ -283,7 +283,10 @@ class MachProcess { JSONGenerator::ObjectSP GetAllLoadedLibrariesInfos(nub_process_t pid, bool fetch_report_load_commands); - JSONGenerator::ObjectSP GetSharedCacheInfo(nub_process_t pid); + bool GetDebugserverSharedCacheInfo(uuid_t &uuid, + std::string &shared_cache_path); + bool GetInferiorSharedCacheFilepath(std::string &inferior_sc_path); + JSONGenerator::ObjectSP GetInferiorSharedCacheInfo(nub_process_t pid); nub_size_t GetNumThreads() const; nub_thread_t GetThreadAtIndex(nub_size_t thread_idx) const; @@ -474,6 +477,14 @@ class MachProcess { void *(*m_dyld_process_info_create)(task_t task, uint64_t timestamp, kern_return_t *kernelError); + void *(*m_dyld_process_create_for_task)(task_read_t task, kern_return_t *kr); + void *(*m_dyld_process_snapshot_create_for_process)(void *process, + kern_return_t *kr); + void *(*m_dyld_process_snapshot_get_shared_cache)(void *snapshot); + void (*m_dyld_shared_cache_for_each_file)( + void *cache, void (^block)(const char *file_path)); + void (*m_dyld_process_snapshot_dispose)(void *snapshot); + void (*m_dyld_process_dispose)(void *process); void (*m_dyld_process_info_for_each_image)( void *info, void (^callback)(uint64_t machHeaderAddress, const uuid_t uuid, const char *path)); @@ -481,6 +492,7 @@ class MachProcess { void (*m_dyld_process_info_get_cache)(void *info, void *cacheInfo); uint32_t (*m_dyld_process_info_get_platform)(void *info); void 
(*m_dyld_process_info_get_state)(void *info, void *stateInfo); + const char *(*m_dyld_shared_cache_file_path)(); }; #endif // LLDB_TOOLS_DEBUGSERVER_SOURCE_MACOSX_MACHPROCESS_H diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 3b875e61a268d..10ed8045a9211 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -534,13 +534,35 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, m_image_infos_baton(NULL), m_sent_interrupt_signo(0), m_auto_resume_signo(0), m_did_exec(false), m_dyld_process_info_create(nullptr), + m_dyld_process_create_for_task(nullptr), + m_dyld_process_snapshot_create_for_process(nullptr), + m_dyld_process_snapshot_get_shared_cache(nullptr), + m_dyld_shared_cache_for_each_file(nullptr), + m_dyld_process_snapshot_dispose(nullptr), m_dyld_process_dispose(nullptr), m_dyld_process_info_for_each_image(nullptr), m_dyld_process_info_release(nullptr), m_dyld_process_info_get_cache(nullptr), - m_dyld_process_info_get_state(nullptr) { + m_dyld_process_info_get_state(nullptr), + m_dyld_shared_cache_file_path(nullptr) { m_dyld_process_info_create = (void *(*)(task_t task, uint64_t timestamp, kern_return_t * kernelError)) dlsym(RTLD_DEFAULT, "_dyld_process_info_create"); + + m_dyld_process_create_for_task = + (void *(*)(task_read_t, kern_return_t *))dlsym( + RTLD_DEFAULT, "dyld_process_create_for_task"); + m_dyld_process_snapshot_create_for_process = + (void *(*)(void *, kern_return_t *))dlsym( + RTLD_DEFAULT, "dyld_process_snapshot_create_for_process"); + m_dyld_process_snapshot_get_shared_cache = (void *(*)(void *))dlsym( + RTLD_DEFAULT, "dyld_process_snapshot_get_shared_cache"); + m_dyld_shared_cache_for_each_file = + (void (*)(void *, void (^)(const char *)))dlsym( + RTLD_DEFAULT, "dyld_shared_cache_for_each_file"); + m_dyld_process_snapshot_dispose = + (void (*)(void *))dlsym(RTLD_DEFAULT, "dyld_process_snapshot_dispose"); + m_dyld_process_dispose = + (void (*)(void *))dlsym(RTLD_DEFAULT, "dyld_process_dispose"); m_dyld_process_info_for_each_image = (void (*)(void *info, void (^)(uint64_t machHeaderAddress, const uuid_t uuid, const char *path))) @@ -553,6 +575,8 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, RTLD_DEFAULT, "_dyld_process_info_get_platform"); m_dyld_process_info_get_state = (void (*)(void *info, void *stateInfo))dlsym( RTLD_DEFAULT, "_dyld_process_info_get_state"); + m_dyld_shared_cache_file_path = + (const char *(*)())dlsym(RTLD_DEFAULT, "dyld_shared_cache_file_path"); DNBLogThreadedIf(LOG_PROCESS | LOG_VERBOSE, "%s", __PRETTY_FUNCTION__); } @@ -1179,13 +1203,82 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { /* report_load_commands = */ true); } -// From dyld's internal podyld_process_info.h: +bool MachProcess::GetDebugserverSharedCacheInfo( + uuid_t &uuid, std::string &shared_cache_path) { + uuid_clear(uuid); + shared_cache_path.clear(); + + if (m_dyld_process_info_create && m_dyld_process_info_get_cache) { + kern_return_t kern_ret; + dyld_process_info info = + m_dyld_process_info_create(mach_task_self(), 0, &kern_ret); + if (info) { + struct dyld_process_cache_info shared_cache_info; + m_dyld_process_info_get_cache(info, &shared_cache_info); + uuid_copy(uuid, shared_cache_info.cacheUUID); + m_dyld_process_info_release(info); + } + } + if (m_dyld_shared_cache_file_path) { + const char *cache_path = m_dyld_shared_cache_file_path(); + if (cache_path) + 
shared_cache_path = cache_path; + } + if (!uuid_is_null(uuid)) + return true; + return false; +} + +bool MachProcess::GetInferiorSharedCacheFilepath( + std::string &inferior_sc_path) { + inferior_sc_path.clear(); + + if (!m_dyld_process_create_for_task || + !m_dyld_process_snapshot_create_for_process || + !m_dyld_process_snapshot_get_shared_cache || + !m_dyld_shared_cache_for_each_file || !m_dyld_process_snapshot_dispose || + !m_dyld_process_dispose) + return false; + + __block std::string sc_path; + kern_return_t kr; + void *process = m_dyld_process_create_for_task(m_task.TaskPort(), &kr); + if (kr != KERN_SUCCESS) + return false; + void *snapshot = m_dyld_process_snapshot_create_for_process(process, &kr); + if (kr != KERN_SUCCESS) + return false; + void *cache = m_dyld_process_snapshot_get_shared_cache(snapshot); + + // The shared cache is a collection of files on disk; this callback + // will iterate over all of them. + // The first filepath provided is the base filename of the cache. + __block bool done = false; + m_dyld_shared_cache_for_each_file(cache, ^(const char *path) { + if (done) { + return; + } + done = true; + sc_path = path; + }); + m_dyld_process_snapshot_dispose(snapshot); + m_dyld_process_dispose(process); + + inferior_sc_path = sc_path; + if (!sc_path.empty()) + return true; + return false; +} + +// From dyld's internal dyld_process_info.h: -JSONGenerator::ObjectSP MachProcess::GetSharedCacheInfo(nub_process_t pid) { +JSONGenerator::ObjectSP +MachProcess::GetInferiorSharedCacheInfo(nub_process_t pid) { JSONGenerator::DictionarySP reply_sp(new JSONGenerator::Dictionary()); - kern_return_t kern_ret; + uuid_t inferior_sc_uuid; if (m_dyld_process_info_create && m_dyld_process_info_get_cache) { + kern_return_t kern_ret; dyld_process_info info = m_dyld_process_info_create(m_task.TaskPort(), 0, &kern_ret); if (info) { @@ -1197,6 +1290,7 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { uuid_string_t uuidstr; uuid_unparse_upper(shared_cache_info.cacheUUID, uuidstr); + uuid_copy(inferior_sc_uuid, shared_cache_info.cacheUUID); reply_sp->AddStringItem("shared_cache_uuid", uuidstr); reply_sp->AddBooleanItem("no_shared_cache", shared_cache_info.noCache); @@ -1206,6 +1300,29 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { m_dyld_process_info_release(info); } } + + // If debugserver and the inferior have the same cache UUID, + // use the simple call to get the filepath to debugserver's shared + // cache, and return that. + uuid_t debugserver_sc_uuid; + std::string debugserver_sc_path; + bool found_sc_filepath = false; + if (GetDebugserverSharedCacheInfo(debugserver_sc_uuid, debugserver_sc_path)) { + if (uuid_compare(inferior_sc_uuid, debugserver_sc_uuid) == 0 && + !debugserver_sc_path.empty()) { + reply_sp->AddStringItem("shared_cache_path", debugserver_sc_path); + found_sc_filepath = true; + } + } + + // Use SPI that are only available on newer OSes to fetch the + // filepath of the shared cache of the inferior, if available. 
+ if (!found_sc_filepath) { + std::string inferior_sc_path; + if (GetInferiorSharedCacheFilepath(inferior_sc_path)) + reply_sp->AddStringItem("shared_cache_path", inferior_sc_path); + } + return reply_sp; } diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index 8286d8e8e45cc..ad46d2f1466cf 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -506,17 +506,6 @@ class DependenceInfo { bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, const SCEV *Y) const; - /// isKnownLessThan - Compare to see if S is less than Size - /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra - /// checking if S is an AddRec and we can prove lessthan using the loop - /// bounds. - bool isKnownLessThan(const SCEV *S, const SCEV *Size) const; - - /// isKnownNonNegative - Compare to see if S is known not to be negative - /// Uses the fact that S comes from Ptr, which may be an inbound GEP, - /// Proving there is no wrapping going on. - bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const; - /// collectUpperBound - All subscripts are the same type (on my machine, /// an i64). The loop bound may be a smaller type. collectUpperBound /// find the bound, if available, and zero extends it to the Type T. diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 501cbc947132e..21f622ea471e1 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1185,11 +1185,11 @@ class SelectionDAG { SDValue getPOISON(EVT VT) { return getNode(ISD::POISON, SDLoc(), VT); } /// Return a node that represents the runtime scaling 'MulImm * RuntimeVL'. - LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, - bool ConstantFold = true); + LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm); - LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, - bool ConstantFold = true); + LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC); + + LLVM_ABI SDValue getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS); /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc. SDValue getGLOBAL_OFFSET_TABLE(EVT VT) { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 06f4fd30664a8..fa29d4afbec6f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -578,16 +578,33 @@ class OpenMPIRBuilder { using FinalizeCallbackTy = std::function; struct FinalizationInfo { - /// The finalization callback provided by the last in-flight invocation of - /// createXXXX for the directive of kind DK. - FinalizeCallbackTy FiniCB; - + FinalizationInfo(FinalizeCallbackTy FiniCB, omp::Directive DK, + bool IsCancellable) + : DK(DK), IsCancellable(IsCancellable), FiniCB(std::move(FiniCB)) {} /// The directive kind of the innermost directive that has an associated /// region which might require finalization when it is left. - omp::Directive DK; + const omp::Directive DK; /// Flag to indicate if the directive is cancellable. - bool IsCancellable; + const bool IsCancellable; + + /// The basic block to which control should be transferred to + /// implement the FiniCB. Memoized to avoid generating finalization + /// multiple times. 
+ Expected getFiniBB(IRBuilderBase &Builder); + + /// For cases where there is an unavoidable existing finalization block + /// (e.g. loop finalization after omp sections). The existing finalization + /// block must not contain any non-finalization code. + Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB); + + private: + /// Access via getFiniBB. + BasicBlock *FiniBB = nullptr; + + /// The finalization callback provided by the last in-flight invocation of + /// createXXXX for the directive of kind DK. + FinalizeCallbackTy FiniCB; }; /// Push a finalization callback on the finalization stack. @@ -2268,8 +2285,7 @@ class OpenMPIRBuilder { /// /// \return an error, if any were triggered during execution. LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, - omp::Directive CanceledDirective, - FinalizeCallbackTy ExitCB = {}); + omp::Directive CanceledDirective); /// Generate a target region entry call. /// @@ -3496,7 +3512,8 @@ class OpenMPIRBuilder { /// Common interface to finalize the region /// /// \param OMPD Directive to generate exiting code for - /// \param FinIP Insertion point for emitting Finalization code and exit call + /// \param FinIP Insertion point for emitting Finalization code and exit call. + /// This block must not contain any non-finalization code. /// \param ExitCall Call to the ending OMP Runtime Function /// \param HasFinalize indicate if the directive will require finalization /// and has a finalization callback in the stack that diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 4274e951446b8..53b7aede7b4a5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1702,10 +1702,8 @@ void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2)); unsigned EltSize = N->getConstantOperandVal(2); - unsigned Offset = EltSize * HiVT.getVectorMinNumElements(); - SDValue Addend = HiVT.isScalableVT() - ? 
DAG.getVScale(DL, MVT::i64, APInt(64, Offset)) - : DAG.getConstant(Offset, DL, MVT::i64); + ElementCount Offset = HiVT.getVectorElementCount() * EltSize; + SDValue Addend = DAG.getElementCount(DL, MVT::i64, Offset); PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 42786db653fa5..4d9dd842ce4c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2098,32 +2098,43 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { return SDValue(CondCodeNodes[Cond], 0); } -SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm, - bool ConstantFold) { +SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm) { assert(MulImm.getBitWidth() == VT.getSizeInBits() && "APInt size does not match type size!"); if (MulImm == 0) return getConstant(0, DL, VT); - if (ConstantFold) { - const MachineFunction &MF = getMachineFunction(); - const Function &F = MF.getFunction(); - ConstantRange CR = getVScaleRange(&F, 64); - if (const APInt *C = CR.getSingleElement()) - return getConstant(MulImm * C->getZExtValue(), DL, VT); - } + const MachineFunction &MF = getMachineFunction(); + const Function &F = MF.getFunction(); + ConstantRange CR = getVScaleRange(&F, 64); + if (const APInt *C = CR.getSingleElement()) + return getConstant(MulImm * C->getZExtValue(), DL, VT); return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT)); } -SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, - bool ConstantFold) { - if (EC.isScalable()) - return getVScale(DL, VT, - APInt(VT.getSizeInBits(), EC.getKnownMinValue())); +/// \returns a value of type \p VT that represents the runtime value of \p +/// Quantity, i.e. scaled by vscale if it's scalable, or a fixed constant +/// otherwise. Quantity should be a FixedOrScalableQuantity, i.e. ElementCount +/// or TypeSize. 
+template +static SDValue getFixedOrScalableQuantity(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, Ty Quantity) { + if (Quantity.isScalable()) + return DAG.getVScale( + DL, VT, APInt(VT.getSizeInBits(), Quantity.getKnownMinValue())); + + return DAG.getConstant(Quantity.getKnownMinValue(), DL, VT); +} + +SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, + ElementCount EC) { + return getFixedOrScalableQuantity(*this, DL, VT, EC); +} - return getConstant(EC.getKnownMinValue(), DL, VT); +SDValue SelectionDAG::getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS) { + return getFixedOrScalableQuantity(*this, DL, VT, TS); } SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) { @@ -8500,16 +8511,7 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags) { - EVT VT = Base.getValueType(); - SDValue Index; - - if (Offset.isScalable()) - Index = getVScale(DL, Base.getValueType(), - APInt(Base.getValueSizeInBits().getFixedValue(), - Offset.getKnownMinValue())); - else - Index = getConstant(Offset.getFixedValue(), DL, VT); - + SDValue Index = getTypeSize(DL, Base.getValueType(), Offset); return getMemBasePlusOffset(Base, Index, DL, Flags); } @@ -13614,11 +13616,8 @@ std::pair SelectionDAG::SplitEVL(SDValue N, EVT VecVT, EVT VT = N.getValueType(); assert(VecVT.getVectorElementCount().isKnownEven() && "Expecting the mask to be an evenly-sized vector"); - unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2; - SDValue HalfNumElts = - VecVT.isFixedLengthVector() - ? getConstant(HalfMinNumElts, DL, VT) - : getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), HalfMinNumElts)); + SDValue HalfNumElts = getElementCount( + DL, VT, VecVT.getVectorElementCount().divideCoefficientBy(2)); SDValue Lo = getNode(ISD::UMIN, DL, VT, N, HalfNumElts); SDValue Hi = getNode(ISD::USUBSAT, DL, VT, N, HalfNumElts); return std::make_pair(Lo, Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 3b7db2c54bae0..73510db3d140d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4584,17 +4584,9 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { if (AllocSize.getValueType() != IntPtr) AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr); - if (TySize.isScalable()) - AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize, - DAG.getVScale(dl, IntPtr, - APInt(IntPtr.getScalarSizeInBits(), - TySize.getKnownMinValue()))); - else { - SDValue TySizeValue = - DAG.getConstant(TySize.getFixedValue(), dl, MVT::getIntegerVT(64)); - AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize, - DAG.getZExtOrTrunc(TySizeValue, dl, IntPtr)); - } + AllocSize = DAG.getNode( + ISD::MUL, dl, IntPtr, AllocSize, + DAG.getZExtOrTrunc(DAG.getTypeSize(dl, MVT::i64, TySize), dl, IntPtr)); // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. 
If the size is greater than or equal to diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 521d8f07434e6..783ec4b0bd211 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10628,12 +10628,8 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, AddrVT); Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT); Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale); - } else if (DataVT.isScalableVector()) { - Increment = DAG.getVScale(DL, AddrVT, - APInt(AddrVT.getFixedSizeInBits(), - DataVT.getStoreSize().getKnownMinValue())); } else - Increment = DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT); + Increment = DAG.getTypeSize(DL, AddrVT, DataVT.getStoreSize()); return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment); } @@ -11926,10 +11922,8 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, // Store the lo part of CONCAT_VECTORS(V1, V2) SDValue StoreV1 = DAG.getStore(DAG.getEntryNode(), DL, V1, StackPtr, PtrInfo); // Store the hi part of CONCAT_VECTORS(V1, V2) - SDValue OffsetToV2 = DAG.getVScale( - DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinValue())); - SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2); + SDValue VTBytes = DAG.getTypeSize(DL, PtrVT, VT.getStoreSize()); + SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, VTBytes); SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, StackPtr2, PtrInfo); if (Imm >= 0) { @@ -11948,13 +11942,8 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, SDValue TrailingBytes = DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT); - if (TrailingElts > VT.getVectorMinNumElements()) { - SDValue VLBytes = - DAG.getVScale(DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), - VT.getStoreSize().getKnownMinValue())); - TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes); - } + if (TrailingElts > VT.getVectorMinNumElements()) + TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VTBytes); // Calculate the start address of the spliced result. StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 29900115d7a25..cfaf0935e240b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -809,6 +809,47 @@ FunctionCallee OpenMPIRBuilder::unsignedGetOrCreateAtomicCASRuntimeFunction( return {FnTy, C}; } +Expected +OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) { + if (!FiniBB) { + Function *ParentFunc = Builder.GetInsertBlock()->getParent(); + IRBuilderBase::InsertPointGuard Guard(Builder); + FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc); + Builder.SetInsertPoint(FiniBB); + // FiniCB adds the branch to the exit stub. + if (Error Err = FiniCB(Builder.saveIP())) + return Err; + } + return FiniBB; +} + +Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder, + BasicBlock *OtherFiniBB) { + // Simple case: FiniBB does not exist yet: re-use OtherFiniBB. + if (!FiniBB) { + FiniBB = OtherFiniBB; + + Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt()); + if (Error Err = FiniCB(Builder.saveIP())) + return Err; + + return Error::success(); + } + + // Move instructions from FiniBB to the start of OtherFiniBB. 
+ auto EndIt = FiniBB->end(); + if (FiniBB->size() >= 1) + if (auto Prev = std::prev(EndIt); Prev->isTerminator()) + EndIt = Prev; + OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(), + EndIt); + + FiniBB->replaceAllUsesWith(OtherFiniBB); + FiniBB->eraseFromParent(); + FiniBB = OtherFiniBB; + return Error::success(); +} + Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) { FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID); auto *Fn = dyn_cast(RTLFn.getCallee()); @@ -1231,8 +1272,20 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, auto *UI = Builder.CreateUnreachable(); Instruction *ThenTI = UI, *ElseTI = nullptr; - if (IfCondition) + if (IfCondition) { SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI); + + // Even if the if condition evaluates to false, this should count as a + // cancellation point + Builder.SetInsertPoint(ElseTI); + auto ElseIP = Builder.saveIP(); + + InsertPointOrErrorTy IPOrErr = createCancellationPoint( + LocationDescription{ElseIP, Loc.DL}, CanceledDirective); + if (!IPOrErr) + return IPOrErr; + } + Builder.SetInsertPoint(ThenTI); Value *CancelKind = nullptr; @@ -1252,21 +1305,9 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = createRuntimeFunctionCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args); - auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error { - if (CanceledDirective == OMPD_parallel) { - IRBuilder<>::InsertPointGuard IPG(Builder); - Builder.restoreIP(IP); - return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_unknown, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false) - .takeError(); - } - return Error::success(); - }; // The actual cancel logic is shared with others, e.g., cancel_barriers. - if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB)) + if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective)) return Err; // Update the insertion point and remove the terminator we introduced. @@ -1303,21 +1344,9 @@ OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc, Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = createRuntimeFunctionCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args); - auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error { - if (CanceledDirective == OMPD_parallel) { - IRBuilder<>::InsertPointGuard IPG(Builder); - Builder.restoreIP(IP); - return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_unknown, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false) - .takeError(); - } - return Error::success(); - }; // The actual cancel logic is shared with others, e.g., cancel_barriers. - if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB)) + if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective)) return Err; // Update the insertion point and remove the terminator we introduced. 
@@ -1420,8 +1449,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch( } Error OpenMPIRBuilder::emitCancelationCheckImpl( - Value *CancelFlag, omp::Directive CanceledDirective, - FinalizeCallbackTy ExitCB) { + Value *CancelFlag, omp::Directive CanceledDirective) { assert(isLastFinalizationInfoCancellable(CanceledDirective) && "Unexpected cancellation!"); @@ -1448,13 +1476,12 @@ Error OpenMPIRBuilder::emitCancelationCheckImpl( // From the cancellation block we finalize all variables and go to the // post finalization block that is known to the FiniCB callback. - Builder.SetInsertPoint(CancellationBlock); - if (ExitCB) - if (Error Err = ExitCB(Builder.saveIP())) - return Err; auto &FI = FinalizationStack.back(); - if (Error Err = FI.FiniCB(Builder.saveIP())) - return Err; + Expected FiniBBOrErr = FI.getFiniBB(Builder); + if (!FiniBBOrErr) + return FiniBBOrErr.takeError(); + Builder.SetInsertPoint(CancellationBlock); + Builder.CreateBr(*FiniBBOrErr); // The continuation block is where code generation continues. Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin()); @@ -2053,8 +2080,18 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator(); InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator()); - if (Error Err = FiniCB(PreFiniIP)) - return Err; + Expected FiniBBOrErr = FiniInfo.getFiniBB(Builder); + if (!FiniBBOrErr) + return FiniBBOrErr.takeError(); + { + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.restoreIP(PreFiniIP); + Builder.CreateBr(*FiniBBOrErr); + // There's currently a branch to omp.par.exit. Delete it. We will get there + // via the fini block + if (Instruction *Term = Builder.GetInsertBlock()->getTerminator()) + Term->eraseFromParent(); + } // Register the outlined info. addOutlineInfo(std::move(OI)); @@ -2493,23 +2530,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( if (!updateToLocation(Loc)) return Loc.IP; - // FiniCBWrapper needs to create a branch to the loop finalization block, but - // this has not been created yet at some times when this callback runs. - SmallVector CancellationBranches; - auto FiniCBWrapper = [&](InsertPointTy IP) { - if (IP.getBlock()->end() != IP.getPoint()) - return FiniCB(IP); - // This must be done otherwise any nested constructs using FinalizeOMPRegion - // will fail because that function requires the Finalization Basic Block to - // have a terminator, which is already removed by EmitOMPRegionBody. - // IP is currently at cancelation block. 
- BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock()); - IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator()); - CancellationBranches.push_back(DummyBranch); - return FiniCB(IP); - }; - - FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable}); + FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable}); // Each section is emitted as a switch case // Each finalization callback is handled from clang.EmitOMPSectionDirective() @@ -2576,20 +2597,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( auto FiniInfo = FinalizationStack.pop_back_val(); assert(FiniInfo.DK == OMPD_sections && "Unexpected finalization stack state!"); - if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) { - Builder.restoreIP(AfterIP); - BasicBlock *FiniBB = - splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini"); - if (Error Err = CB(Builder.saveIP())) - return Err; - AfterIP = {FiniBB, FiniBB->begin()}; - } - - // Now we can fix the dummy branch to point to the right place - for (BranchInst *DummyBranch : CancellationBranches) { - assert(DummyBranch->getNumSuccessors() == 1); - DummyBranch->setSuccessor(0, LoopFini); - } + if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini)) + return Err; return AfterIP; } @@ -6957,9 +6966,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion( emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize); if (!AfterIP) return AfterIP.takeError(); - assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB && - "Unexpected Control Flow State!"); - MergeBlockIntoPredecessor(FiniBB); // If we are skipping the region of a non conditional, remove the exit // block, and clear the builder's insertion point. @@ -7019,14 +7025,12 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit( FinalizationInfo Fi = FinalizationStack.pop_back_val(); assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!"); - if (Error Err = Fi.FiniCB(FinIP)) - return Err; - - BasicBlock *FiniBB = FinIP.getBlock(); - Instruction *FiniBBTI = FiniBB->getTerminator(); + if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock())) + return std::move(Err); - // set Builder IP for call creation - Builder.SetInsertPoint(FiniBBTI); + // Exit condition: insertion point is before the terminator of the new Fini + // block + Builder.SetInsertPoint(FinIP.getBlock()->getTerminator()); } if (!ExitCall) diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index b454c9a4cd3ae..9beaee60d0bc1 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -841,6 +841,8 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const { if (isEmptySet()) return getEmpty(DstTySize); unsigned SrcTySize = getBitWidth(); + if (DstTySize == SrcTySize) + return *this; assert(SrcTySize < DstTySize && "Not a value extension"); if (isFullSet() || isUpperWrapped()) { // Change into [0, 1 << src bit width) @@ -858,6 +860,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { if (isEmptySet()) return getEmpty(DstTySize); unsigned SrcTySize = getBitWidth(); + if (DstTySize == SrcTySize) + return *this; assert(SrcTySize < DstTySize && "Not a value extension"); // special case: [X, INT_MIN) -- not really wrapping around @@ -874,6 +878,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { ConstantRange ConstantRange::truncate(uint32_t DstTySize, unsigned NoWrapKind) const { + if (DstTySize == getBitWidth()) + return 
*this; assert(getBitWidth() > DstTySize && "Not a value truncation"); if (isEmptySet()) return getEmpty(DstTySize); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 34d74d04c4419..60e6a82d41cc8 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1717,6 +1717,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case AArch64::InOutZAUsePseudo: case AArch64::RequiresZASavePseudo: + case AArch64::RequiresZT0SavePseudo: case AArch64::SMEStateAllocPseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6072fd9d8f242..b4f47d249885d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8647,7 +8647,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); - uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue(); + TypeSize PartSize = VA.getValVT().getStoreSize(); unsigned NumParts = 1; if (Ins[i].Flags.isInConsecutiveRegs()) { while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) @@ -8664,16 +8664,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( InVals.push_back(ArgValue); NumParts--; if (NumParts > 0) { - SDValue BytesIncrement; - if (PartLoad.isScalableVector()) { - BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); - } else { - BytesIncrement = DAG.getConstant( - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, - Ptr.getValueType()); - } + SDValue BytesIncrement = + DAG.getTypeSize(DL, Ptr.getValueType(), PartSize); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, SDNodeFlags::NoUnsignedWrap); ExtraArgLocs++; @@ -9642,6 +9634,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState()) ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; + else if (CallAttrs.requiresPreservingZT0()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE; else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State()) ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; @@ -9761,7 +9755,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ZTFrameIdx; MachineFrameInfo &MFI = MF.getFrameInfo(); - bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0(); + bool ShouldPreserveZT0 = + !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0(); // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. @@ -9774,7 +9769,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If caller shares ZT0 but the callee is not shared ZA, we need to stop // PSTATE.ZA before the call if there is no lazy-save active. 
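+  // Note: with the new SME ABI lowering, the PSTATE.ZA switches around calls
+  // are emitted later by MachineSMEABIPass, so they are skipped here.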
- bool DisableZA = CallAttrs.requiresDisablingZABeforeCall(); + bool DisableZA = + !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall(); assert((!DisableZA || !RequiresLazySave) && "Lazy-save should have PSTATE.SM=1 on entry to the function"); @@ -9876,8 +9872,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, assert((isScalable || Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); - uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); - uint64_t PartSize = StoreSize; + TypeSize StoreSize = VA.getValVT().getStoreSize(); + TypeSize PartSize = StoreSize; unsigned NumParts = 1; if (Outs[i].Flags.isInConsecutiveRegs()) { while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) @@ -9888,7 +9884,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); MachineFrameInfo &MFI = MF.getFrameInfo(); - int FI = MFI.CreateStackObject(StoreSize, Alignment, false); + int FI = + MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false); if (isScalable) { bool IsPred = VA.getValVT() == MVT::aarch64svcount || VA.getValVT().getVectorElementType() == MVT::i1; @@ -9909,16 +9906,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, NumParts--; if (NumParts > 0) { - SDValue BytesIncrement; - if (isScalable) { - BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); - } else { - BytesIncrement = DAG.getConstant( - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, - Ptr.getValueType()); - } + SDValue BytesIncrement = + DAG.getTypeSize(DL, Ptr.getValueType(), PartSize); MPI = MachinePointerInfo(MPI.getAddrSpace()); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, SDNodeFlags::NoUnsignedWrap); @@ -10263,7 +10252,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getSMToggleCondition(CallAttrs)); } - if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) + if (!UseNewSMEABILowering && + (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())) // Unconditionally resume ZA. 
 Result = DAG.getNode(
         AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 737169253ddb3..b099f15ecf7e3 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -102,6 +102,7 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)),
 let hasSideEffects = 1, isMeta = 1 in {
   def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
   def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def RequiresZT0SavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
@@ -122,6 +123,11 @@ def AArch64_requires_za_save
                      [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
 def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
+def AArch64_requires_zt0_save
+    : SDNode<"AArch64ISD::REQUIRES_ZT0_SAVE", SDTypeProfile<0, 0, []>,
+             [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+def : Pat<(AArch64_requires_zt0_save), (RequiresZT0SavePseudo)>;
+
 def AArch64_sme_state_alloc
     : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
              [SDNPHasChain]>;
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index ead1dfceb96a0..b96f6f12a58d6 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -72,20 +72,34 @@ using namespace llvm;
 
 namespace {
 
-enum ZAState {
+// Note: For agnostic ZA, we assume the function is always entered/exited in
+// the "ACTIVE" state -- this _may_ not be the case (since OFF is also a
+// possibility), but for the purpose of placing ZA saves/restores, that does
+// not matter.
+enum ZAState : uint8_t {
   // Any/unknown state (not valid)
   ANY = 0,
 
   // ZA is in use and active (i.e. within the accumulator)
   ACTIVE,
 
+  // ZA is active, but ZT0 has been saved.
+  // This handles the edge case of sharedZA && !sharesZT0.
+  ACTIVE_ZT0_SAVED,
+
+  // A ZA save has been set up or committed (i.e. ZA is dormant or off).
+  // If the function uses ZT0 it must also be saved.
   LOCAL_SAVED,
 
+  // ZA has been committed to the lazy save buffer of the current function.
+  // If the function uses ZT0 it must also be saved.
+  // ZA is off.
+  LOCAL_COMMITTED,
+
+  // The ZA/ZT0 state on entry to the function.
   ENTRY,
 
-  // ZA is off
+  // ZA is off.
   OFF,
 
   // The number of ZA states (not a valid state)
@@ -164,6 +178,14 @@ class EmitContext {
     return AgnosticZABufferPtr;
   }
 
+  int getZT0SaveSlot(MachineFunction &MF) {
+    if (ZT0SaveFI)
+      return *ZT0SaveFI;
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    ZT0SaveFI = MFI.CreateSpillStackObject(64, Align(16));
+    return *ZT0SaveFI;
+  }
+
   /// Returns true if the function must allocate a ZA save buffer on entry. This
   /// will be the case if, at any point in the function, a ZA save was emitted.
   bool needsSaveBuffer() const {
@@ -173,6 +195,7 @@
 private:
+  std::optional<int> ZT0SaveFI;
   std::optional<int> TPIDR2BlockFI;
   Register AgnosticZABufferPtr = AArch64::NoRegister;
 };
@@ -184,8 +207,10 @@ class EmitContext {
 /// state would not be legal, as transitioning to it drops the content of ZA.
 static bool isLegalEdgeBundleZAState(ZAState State) {
   switch (State) {
-  case ZAState::ACTIVE:      // ZA state within the accumulator/ZT0.
-  case ZAState::LOCAL_SAVED: // ZA state is saved on the stack.
+  case ZAState::ACTIVE:              // ZA state within the accumulator/ZT0.
+  case ZAState::ACTIVE_ZT0_SAVED:    // ZT0 is saved (ZA is active).
+  case ZAState::LOCAL_SAVED:         // ZA state may be saved on the stack.
+  case ZAState::LOCAL_COMMITTED:     // ZA state is saved on the stack.
     return true;
   default:
     return false;
@@ -199,7 +224,9 @@ StringRef getZAStateString(ZAState State) {
   switch (State) {
     MAKE_CASE(ZAState::ANY)
     MAKE_CASE(ZAState::ACTIVE)
+    MAKE_CASE(ZAState::ACTIVE_ZT0_SAVED)
     MAKE_CASE(ZAState::LOCAL_SAVED)
+    MAKE_CASE(ZAState::LOCAL_COMMITTED)
     MAKE_CASE(ZAState::ENTRY)
     MAKE_CASE(ZAState::OFF)
   default:
@@ -221,18 +248,39 @@ static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
 /// Returns the required ZA state needed before \p MI and an iterator pointing
 /// to where any code required to change the ZA state should be inserted.
 static std::pair<ZAState, MachineBasicBlock::iterator>
-getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
-                     bool ZAOffAtReturn) {
+getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
+                     SMEAttrs SMEFnAttrs) {
   MachineBasicBlock::iterator InsertPt(MI);
+  // Note: InOutZAUsePseudo, RequiresZASavePseudo, and RequiresZT0SavePseudo are
+  // intended to mark the position immediately before a call. Due to
+  // SelectionDAG constraints, these markers occur after the ADJCALLSTACKDOWN,
+  // so we use std::prev(InsertPt) to get the position before the call.
+
   if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
     return {ZAState::ACTIVE, std::prev(InsertPt)};
+  // Note: If we need to save both ZA and ZT0 we use RequiresZASavePseudo.
   if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
     return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
-  if (MI.isReturn())
+  // If we only need to save ZT0, there are two cases to consider:
+  // 1. The function has ZA state (that we don't need to save).
+  //    - In this case we switch to the "ACTIVE_ZT0_SAVED" state.
+  //      This only saves ZT0.
+  // 2. The function does not have ZA state.
+  //    - In this case we switch to the "LOCAL_COMMITTED" state.
+  //      This saves ZT0 and turns ZA off.
+  if (MI.getOpcode() == AArch64::RequiresZT0SavePseudo) {
+    return {SMEFnAttrs.hasZAState() ? ZAState::ACTIVE_ZT0_SAVED
+                                    : ZAState::LOCAL_COMMITTED,
+            std::prev(InsertPt)};
+  }
+
+  if (MI.isReturn()) {
+    bool ZAOffAtReturn = SMEFnAttrs.hasPrivateZAInterface();
     return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
+  }
 
   for (auto &MO : MI.operands()) {
     if (isZAorZTRegOp(TRI, MO))
   void emitSetupFullZASave(MachineBasicBlock &MBB,
@@ -409,7 +460,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       Block.FixedEntryState = ZAState::ENTRY;
     } else if (MBB.isEHPad()) {
       // EH entry block:
-      Block.FixedEntryState = ZAState::LOCAL_SAVED;
+      Block.FixedEntryState = ZAState::LOCAL_COMMITTED;
     }
 
     LiveRegUnits LiveUnits(*TRI);
@@ -431,8 +482,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
         PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
       }
       // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
-      auto [NeededState, InsertPt] = getZAStateBeforeInst(
-          *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
+      auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs);
       assert((InsertPt == MBBI || isCallStartOpcode(InsertPt->getOpcode())) &&
              "Unexpected state change insertion point!");
       // TODO: Do something to avoid state changes where NZCV is live.
@@ -752,9 +802,9 @@ void MachineSMEABI::emitRestoreLazySave(EmitContext &Context,
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
-void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              bool ClearTPIDR2) {
+void MachineSMEABI::emitZAMode(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               bool ClearTPIDR2, bool On) {
   DebugLoc DL = getDebugLoc(MBB, MBBI);
 
   if (ClearTPIDR2)
@@ -765,7 +815,7 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
-  // Disable ZA.
+  // Enable or disable ZA.
   BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
       .addImm(AArch64SVCR::SVCRZA)
-      .addImm(0);
+      .addImm(On ? 1 : 0);
 }
 
 void MachineSMEABI::emitAllocateLazySaveBuffer(
@@ -891,6 +941,28 @@ void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context,
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
+void MachineSMEABI::emitZT0SaveRestore(EmitContext &Context,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       bool IsSave) {
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register ZT0Save = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
+
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), ZT0Save)
+      .addFrameIndex(Context.getZT0SaveSlot(*MF))
+      .addImm(0)
+      .addImm(0);
+
+  if (IsSave) {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STR_TX))
+        .addReg(AArch64::ZT0)
+        .addReg(ZT0Save);
+  } else {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDR_TX), AArch64::ZT0)
+        .addReg(ZT0Save);
+  }
+}
+
 void MachineSMEABI::emitAllocateFullZASaveBuffer(
     EmitContext &Context, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) {
@@ -935,6 +1007,17 @@ void MachineSMEABI::emitAllocateFullZASaveBuffer(
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
+struct FromState {
+  ZAState From;
+
+  constexpr uint8_t to(ZAState To) const {
+    static_assert(NUM_ZA_STATE < 16, "expected ZAState to fit in 4 bits");
+    return uint8_t(From) << 4 | uint8_t(To);
+  }
+};
+
+constexpr FromState transitionFrom(ZAState From) { return FromState{From}; }
+
 void MachineSMEABI::emitStateChange(EmitContext &Context,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertPt,
@@ -949,8 +1032,6 @@ void MachineSMEABI::emitStateChange(EmitContext &Context,
   if (From == ZAState::ENTRY && To == ZAState::OFF)
     return;
 
-  [[maybe_unused]] SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
-
   // TODO: Avoid setting up the save buffer if there's no transition to
   // LOCAL_SAVED.
   if (From == ZAState::ENTRY) {
@@ -966,17 +1047,67 @@ void MachineSMEABI::emitStateChange(EmitContext &Context,
     From = ZAState::ACTIVE;
   }
 
-  if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
-    emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
-  else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
-    emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
-  else if (To == ZAState::OFF) {
-    assert(From != ZAState::ENTRY &&
-           "ENTRY to OFF should have already been handled");
-    assert(!SMEFnAttrs.hasAgnosticZAInterface() &&
-           "Should not turn ZA off in agnostic ZA function");
-    emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
-  } else {
+  SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
+  bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+  bool HasZT0State = SMEFnAttrs.hasZT0State();
+  bool HasZAState = IsAgnosticZA || SMEFnAttrs.hasZAState();
+
+  switch (transitionFrom(From).to(To)) {
+  // This section handles: ACTIVE <-> ACTIVE_ZT0_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::ACTIVE_ZT0_SAVED):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    break;
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::ACTIVE):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
+    break;
+
+  // This section handles: ACTIVE[_ZT0_SAVED] -> LOCAL_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_SAVED):
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::LOCAL_SAVED):
+    if (HasZT0State && From == ZAState::ACTIVE)
+      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    if (HasZAState)
+      emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
+    break;
+
+  // This section handles: ACTIVE -> LOCAL_COMMITTED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_COMMITTED):
+    // TODO: We could support ZA state here, but this transition is currently
+    // only possible when we _don't_ have ZA state.
+    assert(HasZT0State && !HasZAState && "Expect to only have ZT0 state.");
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/false);
+    break;
+
+  // This section handles: LOCAL_COMMITTED -> (OFF|LOCAL_SAVED)
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::OFF):
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::LOCAL_SAVED):
+    // These transitions are no-ops.
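+    // (LOCAL_COMMITTED already implies ZA is off and, if the function uses
+    // ZT0, that ZT0 has been saved, so both target states already hold.)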
+    break;
+
+  // This section handles: LOCAL_(SAVED|COMMITTED) -> ACTIVE[_ZT0_SAVED]
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE):
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE_ZT0_SAVED):
+  case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE):
+    if (HasZAState)
+      emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
+    else
+      emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/true);
+    if (HasZT0State && To == ZAState::ACTIVE)
+      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
+    break;
+
+  // This section handles transitions to OFF (not previously covered)
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::OFF):
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::OFF):
+  case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::OFF):
+    assert(SMEFnAttrs.hasPrivateZAInterface() &&
+           "Did not expect to turn ZA off in shared/agnostic ZA function");
+    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED,
+               /*On=*/false);
+    break;
+
+  default:
     dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
            << getZAStateString(To) << '\n';
     llvm_unreachable("Unimplemented state transition");
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a6212f5cc84be..afbbc0dbeb7ad 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12783,10 +12783,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
   SmallVector<SDValue> Loads(Factor);
 
-  SDValue Increment =
-      DAG.getVScale(DL, PtrVT,
-                    APInt(PtrVT.getFixedSizeInBits(),
-                          VecVT.getStoreSize().getKnownMinValue()));
+  SDValue Increment = DAG.getTypeSize(DL, PtrVT, VecVT.getStoreSize());
   for (unsigned i = 0; i != Factor; ++i) {
     if (i != 0)
       StackPtr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, Increment);
@@ -14184,9 +14181,8 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op,
   // Slide off any elements from past EVL that were reversed into the low
   // elements.
-  unsigned MinElts = GatherVT.getVectorMinNumElements();
   SDValue VLMax =
-      DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), MinElts));
+      DAG.getElementCount(DL, XLenVT, GatherVT.getVectorElementCount());
   SDValue Diff = DAG.getNode(ISD::SUB, DL, XLenVT, VLMax, EVL);
 
   Result = getVSlidedown(DAG, Subtarget, DL, GatherVT,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 13d048a98d6ea..ce4db2e112fa0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -460,8 +460,8 @@ def : Pat<(i64 (WebAssemblyWrapperREL texternalsym:$addr)),
 include "WebAssemblyInstrMemory.td"
 include "WebAssemblyInstrCall.td"
 include "WebAssemblyInstrControl.td"
-include "WebAssemblyInstrInteger.td"
 include "WebAssemblyInstrConv.td"
+include "WebAssemblyInstrInteger.td"
 include "WebAssemblyInstrFloat.td"
 include "WebAssemblyInstrAtomics.td"
 include "WebAssemblyInstrSIMD.td"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index d4c8f92c883e7..eb692679f5971 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -107,6 +107,9 @@ def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
 def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
 
+def : Pat<(shl I64:$lhs, (zext (and I32:$rhs, 63))),
+          (SHL_I64 I64:$lhs, (I64_EXTEND_U_I32 I32:$rhs))>;
+
 defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index e7dc366b13798..c9f51e4b294b1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1163,6 +1163,7 @@ static Value *canonicalizeSaturatedAddSigned(ICmpInst *Cmp, Value *TVal,
   // (X >= Y) ? INT_MAX : (X + C) --> sadd.sat(X, C)
   // where Y is INT_MAX - C or INT_MAX - C - 1, and C > 0
   if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) &&
+      isa<Constant>(Cmp1) &&
       match(FVal, m_Add(m_Specific(Cmp0), m_StrictlyPositive(C)))) {
     APInt IntMax =
         APInt::getSignedMaxValue(Cmp1->getType()->getScalarSizeInBits());
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 4947d03a2dc66..021bf0618754a 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -2098,6 +2098,38 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
     return (void)mergeInValue(ValueState[II], II,
                               ValueLatticeElement::getRange(Result));
   }
+  if (II->getIntrinsicID() == Intrinsic::experimental_get_vector_length) {
+    Value *CountArg = II->getArgOperand(0);
+    Value *VF = II->getArgOperand(1);
+    bool Scalable = cast<ConstantInt>(II->getArgOperand(2))->isOne();
+
+    // Computation happens in the larger type.
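+    // (Count and VF may have different integer widths, so both ranges are
+    // zero-extended to the wider width before being compared.)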
+ unsigned BitWidth = std::max(CountArg->getType()->getScalarSizeInBits(), + VF->getType()->getScalarSizeInBits()); + + ConstantRange Count = getValueState(CountArg) + .asConstantRange(CountArg->getType(), false) + .zeroExtend(BitWidth); + ConstantRange MaxLanes = getValueState(VF) + .asConstantRange(VF->getType(), false) + .zeroExtend(BitWidth); + if (Scalable) + MaxLanes = + MaxLanes.multiply(getVScaleRange(II->getFunction(), BitWidth)); + + // The result is always less than both Count and MaxLanes. + ConstantRange Result( + APInt::getZero(BitWidth), + APIntOps::umin(Count.getUpper(), MaxLanes.getUpper())); + + // If Count <= MaxLanes, getvectorlength(Count, MaxLanes) = Count + if (Count.icmp(CmpInst::ICMP_ULE, MaxLanes)) + Result = Count; + + Result = Result.truncate(II->getType()->getScalarSizeInBits()); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); + } if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { // Compute result range for intrinsics supported by ConstantRange. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b12f8ccc73c7e..f7281283bae81 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -769,7 +769,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). if (auto *PtrIV = dyn_cast(&Phi)) { - if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) + if (!Plan.hasScalarVFOnly() && + !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder); diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index a3027f01e73cf..ea1341186ddfa 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -230,10 +230,6 @@ define void @test7() nounwind "aarch64_inout_zt0" { ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index ef74825e02881..3947127c47844 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -511,7 +511,6 @@ exit: ; ; This code may require reloading ZT0 in the cleanup for ~ZT0Resource(). ; -; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented). define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: try_catch_shared_zt0_callee: ; CHECK: .Lfunc_begin3: @@ -519,52 +518,37 @@ define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @ ; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 ; CHECK-NEXT: .cfi_lsda 28, .Lexception3 ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -24 -; CHECK-NEXT: .cfi_offset w29, -32 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .Ltmp9: // EH_LABEL -; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: // %bb.1: // %return_normally -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_2: // %unwind_dtors ; CHECK-NEXT: .Ltmp11: // EH_LABEL -; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: mov x20, sp ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #80 -; CHECK-NEXT: cbnz x8, .LBB3_4 -; CHECK-NEXT: // %bb.3: // %unwind_dtors -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB3_4: // %unwind_dtors -; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x20] ; CHECK-NEXT: bl shared_zt0_call ; CHECK-NEXT: str zt0, [x20] ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl _Unwind_Resume -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x20] ; ; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee: ; CHECK-SDAG: .Lfunc_begin3: @@ -965,6 +949,239 @@ exit: ret void } +define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_inout_zt0: +; CHECK: .Lfunc_begin7: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .Ltmp21: // EH_LABEL +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp22: // EH_LABEL +; CHECK-NEXT: .LBB7_1: // %exit +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_2: // %catch +; CHECK-NEXT: .Ltmp23: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB7_1 +; +; CHECK-SDAG-LABEL: try_catch_inout_zt0: +; CHECK-SDAG: .Lfunc_begin7: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: 
stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w30, -16 +; CHECK-SDAG-NEXT: .Ltmp21: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: .Ltmp22: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB7_1: // %exit +; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #80 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB7_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp23: // EH_LABEL +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: b .LBB7_1 +entry: + invoke void @may_throw() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + +define void @try_catch_shared_za_callee_zt0_saved(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_shared_za_callee_zt0_saved: +; CHECK: .Lfunc_begin8: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception8 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .Ltmp24: // EH_LABEL +; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp25: // EH_LABEL +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB8_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: // %bb.3: // %return_normally +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_4: // %unwind_dtors +; CHECK-NEXT: .Ltmp26: // EH_LABEL +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB8_6 +; CHECK-NEXT: // %bb.5: // %unwind_dtors +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_6: // %unwind_dtors +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: blr x19 +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl _Unwind_Resume +; +; CHECK-SDAG-LABEL: try_catch_shared_za_callee_zt0_saved: +; CHECK-SDAG: .Lfunc_begin8: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception8 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 48 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w21, -24 +; CHECK-SDAG-NEXT: .cfi_offset w22, -32 +; CHECK-SDAG-NEXT: .cfi_offset w30, -40 +; CHECK-SDAG-NEXT: .cfi_offset w29, -48 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp24: // EH_LABEL +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: sub x20, x29, #80 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp25: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.3: // %return_normally +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB8_4: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp26: // EH_LABEL +; CHECK-SDAG-NEXT: sub x21, x29, #80 +; CHECK-SDAG-NEXT: sub x22, x29, #16 +; CHECK-SDAG-NEXT: mov x20, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_6 +; CHECK-SDAG-NEXT: // %bb.5: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_6: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: str zt0, [x21] +; CHECK-SDAG-NEXT: blr x19 +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mov x0, x20 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x22 +; CHECK-SDAG-NEXT: str zt0, [x21] +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_8 +; CHECK-SDAG-NEXT: // %bb.7: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_8: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr + invoke void @may_throw() + to label %return_normally unwind label %unwind_dtors + +unwind_dtors: + %5 = landingpad { ptr, i32 } + cleanup + call void %callee() "aarch64_inout_za" + resume { ptr, i32 } %5 + +return_normally: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 69c69f027a33f..0d4a39b2eeb2f 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -193,7 +193,7 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee: ; 
CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB6_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -202,14 +202,11 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB6_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za ; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_new_zt0"; @@ -246,7 +243,7 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee: ; CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB7_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -255,12 +252,11 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB7_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; CHECK-NEWLOWERING-NEXT: ret %res = call {i64, i64} @__arm_sme_state() @@ -382,37 +378,57 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind { -; CHECK-COMMON-LABEL: zt0_multiple_private_za_calls: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: sub sp, sp, #96 -; CHECK-COMMON-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: mov x20, sp -; CHECK-COMMON-NEXT: mov x19, x0 -; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x0 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: 
smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Reload -; CHECK-COMMON-NEXT: add sp, sp, #96 -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: zt0_multiple_private_za_calls: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: zt0_multiple_private_za_calls: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x20, sp +; CHECK-NEWLOWERING-NEXT: mov x19, x0 +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEWLOWERING-NEXT: str zt0, [x20] +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: ldr zt0, [x20] +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEWLOWERING-NEXT: add sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: ret call void %callee() call void %callee() call void %callee() diff --git a/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll b/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll index b505c84166eb1..0e8a0c704207d 100644 --- a/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll +++ b/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; Zicond with zfinx(implies by zdinx) -; RUN: llc -mtriple=riscv64 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZDINX_ZICOND -; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZDINX_NOZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_ZICOND,RV64ZDINX_ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_NOZICOND,RV64ZDINX_NOZICOND ; Zicond with zfinx(implies by zhinx) -; RUN: llc -mtriple=riscv64 -mattr=+zhinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZHINX_ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zhinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZHINX_ZICOND,RV64ZHINX_ZICOND ; Baseline with classic FP registers (no *inx); zicond select should NOT trigger ; RUN: llc -mtriple=riscv64 -mattr=+f,+d -verify-machineinstrs < %s | FileCheck %s 
--check-prefix=RV64FD ; Check same optimize work on 32bit machine -; RUN: llc -mtriple=riscv32 -mattr=+zfinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZFINX_ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zfinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZHINX_ZICOND,RV32ZFINX_ZICOND ; RUN: llc -mtriple=riscv32 -mattr=+zfinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZFINX_NOZICOND -; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZDINX_ZICOND -; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZDINX_NOZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_ZICOND,RV32ZDINX_ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_NOZICOND,RV32ZDINX_NOZICOND ; This test checks that floating-point SELECT is lowered through integer ; SELECT (and thus to Zicond czero.* sequence) when FP values live in GPRs @@ -25,37 +25,37 @@ ; ----------------------------------------------------------------------------- define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_f32_i1: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_f32_i1: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: andi a3, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV64ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: mv a0, a2 -; RV64ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: ret -; -; RV64ZHINX_ZICOND-LABEL: select_f32_i1: -; RV64ZHINX_ZICOND: # %bb.0: # %entry -; RV64ZHINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZHINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZHINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZHINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZHINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_f32_i1: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: or a0, a0, a2 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_f32_i1: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: andi a3, a0, 1 +; ZDINX_NOZICOND-NEXT: mv a0, a1 +; ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: mv a0, a2 +; ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry +; ZDINX_NOZICOND-NEXT: ret +; +; ZHINX_ZICOND-LABEL: select_f32_i1: +; ZHINX_ZICOND: # %bb.0: # %entry +; ZHINX_ZICOND-NEXT: # kill: def 
$x12_w killed $x12_w def $x12 +; ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZHINX_ZICOND-NEXT: andi a0, a0, 1 +; ZHINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZHINX_ZICOND-NEXT: or a0, a0, a2 +; ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZHINX_ZICOND-NEXT: ret ; ; RV64FD-LABEL: select_f32_i1: ; RV64FD: # %bb.0: # %entry @@ -66,17 +66,6 @@ define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { ; RV64FD-NEXT: .LBB0_2: # %entry ; RV64FD-NEXT: ret ; -; RV32ZFINX_ZICOND-LABEL: select_f32_i1: -; RV32ZFINX_ZICOND: # %bb.0: # %entry -; RV32ZFINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZFINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZFINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZFINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZFINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZFINX_ZICOND-NEXT: ret -; ; RV32ZFINX_NOZICOND-LABEL: select_f32_i1: ; RV32ZFINX_NOZICOND: # %bb.0: # %entry ; RV32ZFINX_NOZICOND-NEXT: andi a3, a0, 1 @@ -86,27 +75,6 @@ define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { ; RV32ZFINX_NOZICOND-NEXT: mv a0, a2 ; RV32ZFINX_NOZICOND-NEXT: .LBB0_2: # %entry ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_f32_i1: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_f32_i1: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; RV32ZDINX_NOZICOND-NEXT: andi a3, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV32ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: mv a0, a2 -; RV32ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, float %t, float %f ret float %sel @@ -353,32 +321,32 @@ entry: ; ----------------------------------------------------------------------------- define dso_local noundef half @select_half_i1(i1 %cond, half %a, half %b) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_half_i1: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_half_i1: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_NOZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: mv a1, a2 -; RV64ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: 
lui a0, 1048560 -; RV64ZDINX_NOZICOND-NEXT: or a0, a1, a0 -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_NOZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_half_i1: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: or a0, a0, a2 +; ZDINX_ZICOND-NEXT: lui a1, 1048560 +; ZDINX_ZICOND-NEXT: or a0, a0, a1 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_half_i1: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_NOZICOND-NEXT: andi a0, a0, 1 +; ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: mv a1, a2 +; ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry +; ZDINX_NOZICOND-NEXT: lui a0, 1048560 +; ZDINX_NOZICOND-NEXT: or a0, a1, a0 +; ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_NOZICOND-NEXT: ret ; ; RV64ZHINX_ZICOND-LABEL: select_half_i1: ; RV64ZHINX_ZICOND: # %bb.0: # %entry @@ -432,33 +400,6 @@ define dso_local noundef half @select_half_i1(i1 %cond, half %a, half %b) nounwi ; RV32ZFINX_NOZICOND-NEXT: or a0, a1, a0 ; RV32ZFINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_half_i1: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_half_i1: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_NOZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: mv a1, a2 -; RV32ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: lui a0, 1048560 -; RV32ZDINX_NOZICOND-NEXT: or a0, a1, a0 -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, half %a, half %b ret half %sel @@ -468,31 +409,31 @@ entry: ; Test select with i1 condition and zero ret val (cond ? 
a : 0), Zfinx ; ----------------------------------------------------------------------------- define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_i1_f32_0: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_i1_f32_0: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: andi a2, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV64ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: li a0, 0 -; RV64ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: ret -; -; RV64ZHINX_ZICOND-LABEL: select_i1_f32_0: -; RV64ZHINX_ZICOND: # %bb.0: # %entry -; RV64ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZHINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZHINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_i1_f32_0: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_i1_f32_0: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: andi a2, a0, 1 +; ZDINX_NOZICOND-NEXT: mv a0, a1 +; ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: li a0, 0 +; ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry +; ZDINX_NOZICOND-NEXT: ret +; +; ZHINX_ZICOND-LABEL: select_i1_f32_0: +; ZHINX_ZICOND: # %bb.0: # %entry +; ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZHINX_ZICOND-NEXT: andi a0, a0, 1 +; ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZHINX_ZICOND-NEXT: ret ; ; RV64FD-LABEL: select_i1_f32_0: ; RV64FD: # %bb.0: # %entry @@ -503,14 +444,6 @@ define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { ; RV64FD-NEXT: .LBB4_2: # %entry ; RV64FD-NEXT: ret ; -; RV32ZFINX_ZICOND-LABEL: select_i1_f32_0: -; RV32ZFINX_ZICOND: # %bb.0: # %entry -; RV32ZFINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZFINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZFINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZFINX_ZICOND-NEXT: ret -; ; RV32ZFINX_NOZICOND-LABEL: select_i1_f32_0: ; RV32ZFINX_NOZICOND: # %bb.0: # %entry ; RV32ZFINX_NOZICOND-NEXT: andi a2, a0, 1 @@ -520,24 +453,6 @@ define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { ; RV32ZFINX_NOZICOND-NEXT: li a0, 0 ; RV32ZFINX_NOZICOND-NEXT: .LBB4_2: # %entry ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_i1_f32_0: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_i1_f32_0: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; 
RV32ZDINX_NOZICOND-NEXT: andi a2, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV32ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: li a0, 0 -; RV32ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, float %t, float 0.000000e+00 ret float %sel @@ -547,15 +462,15 @@ entry: ; Test select with i1 condition and zero ret val for half fp (cond ? a : 0) ; ----------------------------------------------------------------------------- define dso_local noundef half @select_i1_half_0(i1 %cond, half %val) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_i1_half_0: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_i1_half_0: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: lui a1, 1048560 +; ZDINX_ZICOND-NEXT: or a0, a0, a1 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret ; ; RV64ZDINX_NOZICOND-LABEL: select_i1_half_0: ; RV64ZDINX_NOZICOND: # %bb.0: # %entry @@ -608,16 +523,6 @@ define dso_local noundef half @select_i1_half_0(i1 %cond, half %val) nounwind { ; RV32ZFINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 ; RV32ZFINX_NOZICOND-NEXT: ret ; -; RV32ZDINX_ZICOND-LABEL: select_i1_half_0: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; ; RV32ZDINX_NOZICOND-LABEL: select_i1_half_0: ; RV32ZDINX_NOZICOND: # %bb.0: # %entry ; RV32ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 diff --git a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll index 5bcb023e546b5..368f30fd5d7ed 100644 --- a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll +++ b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll @@ -18,6 +18,21 @@ define i32 @shl_i32(i32 %v, i32 %x) { ret i32 %a } +define i64 @shl_i64_zext(i64 %v, i32 %x) { +; CHECK-LABEL: shl_i64_zext: +; CHECK: .functype shl_i64_zext (i64, i32) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.extend_i32_u +; CHECK-NEXT: i64.shl +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 63 + %z = zext i32 %m to i64 + %a = shl i64 %v, %z + ret i64 %a +} + define i32 @sra_i32(i32 %v, i32 %x) { ; CHECK-LABEL: sra_i32: ; CHECK: .functype sra_i32 (i32, i32) -> (i32) diff --git a/llvm/test/CodeGen/X86/combine-fceil.ll b/llvm/test/CodeGen/X86/combine-fceil.ll new file mode 100644 index 0000000000000..78f1476a49152 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fceil.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_ceil_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_ceil_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_ceil_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_ceil_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_ceil_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $10, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $10, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $10, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $10, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_ceil_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_ceil_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT:
roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $10, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $10, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $10, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $10, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_ceil_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_ceil_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_ceil_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT: roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fnearbyint.ll b/llvm/test/CodeGen/X86/combine-fnearbyint.ll new file mode 100644 index 0000000000000..14d1017aec630 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fnearbyint.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 |
FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_nearbyint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_nearbyint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_nearbyint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $12, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $12, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $12, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $12, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_nearbyint_v16f32_v4f32:
+; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $12, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $12, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $12, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $12, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-frint.ll b/llvm/test/CodeGen/X86/combine-frint.ll new file mode 100644 index 0000000000000..901ce2c1f0d82
--- /dev/null +++ b/llvm/test/CodeGen/X86/combine-frint.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_rint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_rint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_rint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_rint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v4f32(<4 x
float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rint_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_rint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_rint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-froundeven.ll b/llvm/test/CodeGen/X86/combine-froundeven.ll new file mode 100644 index 0000000000000..484e3a9680450 ---
/dev/null +++ b/llvm/test/CodeGen/X86/combine-froundeven.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_roundeven_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_roundeven_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_roundeven_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_roundeven_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_roundeven_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_roundeven_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_roundeven_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_roundeven_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: roundpd $8, %xmm2, %xmm2 +; SSE-NEXT: roundpd $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $8, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $8, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $8, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8
x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundeven_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_roundeven_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: roundps $8, %xmm2, %xmm2 +; SSE-NEXT: roundps $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $8, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $8, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $8, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_roundeven_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_roundeven_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: roundpd $8, %xmm2, %xmm2 +; SSE-NEXT: roundpd $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $8, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $8, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $8, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $8, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundeven_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_roundeven_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: roundps $8, %xmm2, %xmm2 +; SSE-NEXT: roundps $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $8, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $8, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $8, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $8, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +
ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fsqrt.ll b/llvm/test/CodeGen/X86/combine-fsqrt.ll new file mode 100644 index 0000000000000..ddd7d3ac24315 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fsqrt.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_sqrt_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_sqrt_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: vsqrtps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_sqrt_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: sqrtps %xmm2, %xmm2 +; SSE-NEXT: sqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vsqrtps %xmm0, %xmm0 +; AVX1OR2-NEXT: vsqrtps %xmm1, %xmm1 +; AVX1OR2-NEXT: vsqrtps %xmm2, %xmm2 +; AVX1OR2-NEXT: vsqrtps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_sqrt_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vsqrtps %xmm0, %xmm0 +; AVX512-NEXT: vsqrtps %xmm1, %xmm1 +; AVX512-NEXT: vsqrtps %xmm2, %xmm2 +; AVX512-NEXT: vsqrtps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_sqrt_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: sqrtps %xmm2, %xmm2 +; SSE-NEXT: sqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_sqrt_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0 +; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_sqrt_v16f32_v8f32: +; AVX512: # %bb.0: +;
AVX512-NEXT: vsqrtps %ymm0, %ymm0 +; AVX512-NEXT: vsqrtps %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-ftrunc.ll b/llvm/test/CodeGen/X86/combine-ftrunc.ll new file mode 100644 index 0000000000000..a6c703a1cbeae --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-ftrunc.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_trunc_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_trunc_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_trunc_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_trunc_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_trunc_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_trunc_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_trunc_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_trunc_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: roundpd $11, %xmm2, %xmm2 +; SSE-NEXT: roundpd $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $11, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $11, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $11, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $11, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +;
AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_trunc_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_trunc_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: roundps $11, %xmm2, %xmm2 +; SSE-NEXT: roundps $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $11, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $11, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $11, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $11, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_trunc_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_trunc_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: roundpd $11, %xmm2, %xmm2 +; SSE-NEXT: roundpd $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $11, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $11, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_trunc_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_trunc_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: roundps $11, %xmm2, %xmm2 +; SSE-NEXT: roundps $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $11, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq
+; +; AVX512-LABEL: concat_trunc_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $11, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rcp.ll b/llvm/test/CodeGen/X86/combine-rcp.ll new file mode 100644 index 0000000000000..7de3e96d592db --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rcp.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rcp_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rcp_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vrcpps %xmm0, %xmm0 +; AVX-NEXT: vrcpps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +; Ensure we don't convert rcpps to rcp14ps +define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rcp_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm2 +; SSE-NEXT: rcpps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vrcpps %xmm0, %xmm0 +; AVX1OR2-NEXT: vrcpps %xmm1, %xmm1 +; AVX1OR2-NEXT: vrcpps %xmm2, %xmm2 +; AVX1OR2-NEXT: vrcpps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rcp_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %xmm0, %xmm0 +; AVX512-NEXT: vrcpps %xmm1, %xmm1 +; AVX512-NEXT: vrcpps %xmm2, %xmm2 +; AVX512-NEXT: vrcpps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rndscale.ll
b/llvm/test/CodeGen/X86/combine-rndscale.ll new file mode 100644 index 0000000000000..25117e864b512 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rndscale.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_roundpd_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; AVX-LABEL: concat_roundpd_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_roundps_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; AVX1OR2-LABEL: concat_roundpd_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %v2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a2, i32 4) + %v3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a3, i32 4) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; AVX1OR2-LABEL: concat_roundps_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1,
%xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %v2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a2, i32 4) + %v3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a3, i32 4) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; AVX1OR2-LABEL: concat_roundpd_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4) + %v1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a1, i32 4) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; AVX1OR2-LABEL: concat_roundps_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4) + %v1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a1, i32 4) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +; negative test - rounding mode mismatch +define <8 x float> @concat_roundps_v8f32_v4f32_mismatch(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32_mismatch: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $0, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 0) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rsqrt.ll b/llvm/test/CodeGen/X86/combine-rsqrt.ll new file mode 100644 index 0000000000000..78688701f8cd3 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rsqrt.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s
--check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rsqrt_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rsqrt_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +; Ensure we don't convert rsqrtps to rsqrt14ps +define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rsqrt_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: rsqrtps %xmm2, %xmm2 +; SSE-NEXT: rsqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX1OR2-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX1OR2-NEXT: vrsqrtps %xmm2, %xmm2 +; AVX1OR2-NEXT: vrsqrtps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX512-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX512-NEXT: vrsqrtps %xmm2, %xmm2 +; AVX512-NEXT: vrsqrtps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index c0ad5818e448a..1294f867f07c0 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -2671,3 +2671,19 @@ define i8 @neg_neg_constant(i8 %x, i8 %y) { %s = select i1 %cmp, i8 127, i8 %d ret i8 %s } + +; Make sure we don't crash in this case.
+define i32 @pr153053_strict_pred_with_nonconstant_rhs(i32 %x, i32 %y) { +; CHECK-LABEL: @pr153053_strict_pred_with_nonconstant_rhs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 2147483647 +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cmp = icmp slt i32 %x, %y + %add = add i32 %x, 1 + %res = select i1 %cmp, i32 %add, i32 2147483647 + ret i32 %res +} diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index b4fd06316a2e5..4f19a7c586bc3 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -152,59 +152,106 @@ for.end: ret ptr %ptr.phi } -define ptr @both(i32 %k) { -; CHECK-LABEL: define ptr @both( -; CHECK-SAME: i32 [[K:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[BASE:%.*]] = getelementptr inbounds i32, ptr undef, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[K]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP4]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[IND_END1]], i64 -4 -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END2]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC_LAG2:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1 -; CHECK-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]] -; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} -; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[INC_LAG1_LCSSA:%.*]] = 
phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret ptr [[INC_LAG1_LCSSA]]
+define ptr @both(ptr %p, i32 %k) {
+; VEC-LABEL: define ptr @both(
+; VEC-SAME: ptr [[P:%.*]], i32 [[K:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[BASE:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[K]], -1
+; VEC-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; VEC-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; VEC-NEXT:    [[TMP3:%.*]] = trunc i64 [[N_VEC]] to i32
+; VEC-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 4
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP4]]
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[BASE]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <2 x i64> <i64 0, i64 4>
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 8
+; VEC-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x ptr> [[VECTOR_GEP]], i32 1
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; VEC-NEXT:    [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP5]], i64 -4
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ]
+; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[FOR_BODY:.*]]
+; VEC:       [[FOR_BODY]]:
+; VEC-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC_LAG2:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1
+; VEC-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; VEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; VEC-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[FOR_END]]:
+; VEC-NEXT:    [[INC_LAG1_LCSSA:%.*]] = phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret ptr [[INC_LAG1_LCSSA]]
+;
+; INTERLEAVE-LABEL: define ptr @both(
+; INTERLEAVE-SAME: ptr [[P:%.*]], i32 [[K:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[BASE:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[K]], -1
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; INTERLEAVE-NEXT:
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = trunc i64 [[N_VEC]] to i32 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP6]] +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP8]] +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; INTERLEAVE-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 -4 +; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[NEXT_GEP]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[NEXT_GEP1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: br label %[[FOR_BODY:.*]] +; INTERLEAVE: [[FOR_BODY]]: +; INTERLEAVE-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[INC_LAG2:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1 +; INTERLEAVE-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 +; INTERLEAVE-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]] +; INTERLEAVE-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[FOR_END]]: +; INTERLEAVE-NEXT: [[INC_LAG1_LCSSA:%.*]] = phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: ret ptr [[INC_LAG1_LCSSA]] ; entry: - %base = getelementptr inbounds i32, ptr undef, i64 1 + %base = getelementptr inbounds i32, ptr %p, i64 1 br label %for.body for.body: %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] %inc.lag1 = phi ptr [ %base, %entry ], [ %tmp, %for.body] - %inc.lag2 = phi ptr [ undef, %entry ], [ %inc.lag1, %for.body] + %inc.lag2 = phi ptr [ %base, %entry ], [ %inc.lag1, %for.body] %tmp = getelementptr inbounds i32, ptr %inc.lag1, i64 1 %inc = add nsw i32 %inc.phi, 1 %cmp = icmp eq i32 %inc, %k diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index f2e2e2846614b..70c6c7e900c51 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -29,8 +29,9 @@ define void @struct_return_f32_widen(ptr 
noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -77,8 +78,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -232,8 +234,9 @@ define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -273,7 +276,7 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[TMP4]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] @@ -286,7 +289,7 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR4]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR2]] ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i64, i64 } [[TMP12]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 ; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP13]], [[TMP14]] @@ -299,8 +302,9 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -385,7 +389,7 @@ define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noal ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float 
[[IN_VAL]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float [[IN_VAL]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, i32 } [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, i32 } [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] @@ -433,7 +437,7 @@ define void @negative_named_struct_return(ptr noalias readonly %in, ptr noalias ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[IV]] diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll index 83452e72b56b9..1bbac5cc3154b 100644 --- a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -4880,6 +4880,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -4974,6 +4976,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5070,6 +5074,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5157,6 +5163,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5254,6 +5262,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; 
CHECK2: omp.par.exit.exitStub: @@ -5434,6 +5444,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5624,8 +5636,10 @@ entry: ; CHECK2: omp.par.region.split: ; CHECK2-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] ; CHECK2: omp.par.pre_finalize: -; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] -; CHECK2: omp_region.body5: +; CHECK2-NEXT: br label [[FINI:%.*]] +; CHECK2: .fini: +; CHECK2-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:.*]] +; CHECK2: omp_region.body6: ; CHECK2-NEXT: br label [[SEQ_PAR_MERGED2:%.*]] ; CHECK2: seq.par.merged2: ; CHECK2-NEXT: [[ADD_SEQ_OUTPUT_LOAD:%.*]] = load i32, ptr [[LOADGEP_ADD_SEQ_OUTPUT_ALLOC]], align 4 @@ -5634,7 +5648,9 @@ entry: ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY5_SPLIT:%.*]] -; CHECK2: omp_region.body5.split: +; CHECK2: omp_region.body6.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE5:%.*]] +; CHECK2: omp_region.finalize{{.*}}: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) ; CHECK2-NEXT: br label [[OMP_REGION_END4]] ; CHECK2: omp_region.body: @@ -5646,6 +5662,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: diff --git a/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll b/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll new file mode 100644 index 0000000000000..d0741161e729e --- /dev/null +++ b/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -p sccp -S | FileCheck %s + +define i1 @result_le_count() { +; CHECK-LABEL: define i1 @result_le_count() { +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 3, i32 4, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} + +define i1 @result_le_max_lanes(i32 %count) { +; CHECK-LABEL: define i1 @result_le_max_lanes( +; CHECK-SAME: i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 3, i1 false) +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 3, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} + +define i1 @result_le_max_lanes_scalable(i32 %count) vscale_range(2, 4) { +; CHECK-LABEL: define i1 @result_le_max_lanes_scalable( +; CHECK-SAME: i32 [[COUNT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 4, i1 true) +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 4, i1 true) + %res = icmp ule i32 %x, 16 + ret i1 %res +} + +define i32 @count_le_max_lanes() { +; CHECK-LABEL: define i32 @count_le_max_lanes() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label 
%[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 4 +; +entry: + br label %loop + +loop: + %iv = phi i32 [4, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +; Can't simplify because %iv isn't <= max lanes. +define i32 @count_not_le_max_lanes() { +; CHECK-LABEL: define range(i32 0, 5) i32 @count_not_le_max_lanes() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 6, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 false) +; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [6, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +define i32 @count_le_max_lanes_scalable_known() vscale_range(4, 8) { +; CHECK-LABEL: define i32 @count_le_max_lanes_scalable_known( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 16 +; +entry: + br label %loop + +loop: + %iv = phi i32 [16, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +; Can't simplify because %iv isn't guaranteed <= max lanes. 
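+; Unlike the vscale_range variants above, this function carries no vscale_range
+; attribute, so the maximum element count (4 x vscale) is unbounded.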
+define i32 @count_le_max_lanes_scalable_unknown() { +; CHECK-LABEL: define range(i32 0, -1) i32 @count_le_max_lanes_scalable_unknown() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 16, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 true) +; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [16, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +define i1 @result_le_overflow() { +; CHECK-LABEL: define i1 @result_le_overflow() { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 4294967296, i32 4, i1 false) +; CHECK-NEXT: [[RES:%.*]] = icmp ule i32 [[X]], 3 +; CHECK-NEXT: ret i1 [[RES]] +; + %x = call i32 @llvm.experimental.get.vector.length(i64 u0x100000000, i32 4, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h index b1c0e59ff2b92..20a95dd2f6aa6 100644 --- a/llvm/unittests/CAS/CASTestConfig.h +++ b/llvm/unittests/CAS/CASTestConfig.h @@ -15,6 +15,11 @@ #include "gtest/gtest.h" #include +#ifdef _WIN32 +#include "llvm/Support/VersionTuple.h" +#include "llvm/Support/Windows/WindowsSupport.h" +#endif + namespace llvm::unittest::cas { class MockEnv { void anchor(); @@ -68,6 +73,10 @@ class CASTest } void SetUp() override { +#ifdef _WIN32 + if (llvm::GetWindowsOSVersion() < llvm::VersionTuple(10, 0, 0, 17763)) + GTEST_SKIP() << "CAS tests skipped on older windows version"; +#endif NextCASIndex = 0; setMaxOnDiskCASMappingSize(); } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 1fe32eba9b3c3..9ba4a91755bd1 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -429,8 +429,8 @@ TEST_F(OpenMPIRBuilderTest, CreateCancel) { OMPBuilder.createCancel(Loc, nullptr, OMPD_parallel)); Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); - EXPECT_EQ(M->size(), 4U); - EXPECT_EQ(F->size(), 4U); + EXPECT_EQ(M->size(), 3U); + EXPECT_EQ(F->size(), 5U); EXPECT_EQ(BB->size(), 4U); CallInst *GTID = dyn_cast(&BB->front()); @@ -450,23 +450,16 @@ TEST_F(OpenMPIRBuilderTest, CreateCancel) { Instruction *CancelBBTI = Cancel->getParent()->getTerminator(); EXPECT_EQ(CancelBBTI->getNumSuccessors(), 2U); EXPECT_EQ(CancelBBTI->getSuccessor(0), NewIP.getBlock()); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 3U); - CallInst *GTID1 = dyn_cast(&CancelBBTI->getSuccessor(1)->front()); - EXPECT_NE(GTID1, nullptr); - EXPECT_EQ(GTID1->arg_size(), 1U); - EXPECT_EQ(GTID1->getCalledFunction()->getName(), "__kmpc_global_thread_num"); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotFreeMemory()); - CallInst *Barrier = dyn_cast(GTID1->getNextNode()); - EXPECT_NE(Barrier, nullptr); - EXPECT_EQ(Barrier->arg_size(), 2U); - EXPECT_EQ(Barrier->getCalledFunction()->getName(), "__kmpc_cancel_barrier"); - 
EXPECT_FALSE(Barrier->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotFreeMemory()); - EXPECT_TRUE(Barrier->use_empty()); + EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB); + // cancel branch instruction (1) -> .cncl -> .fini -> CBB + EXPECT_EQ(CancelBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), + CBB); EXPECT_EQ(cast(Cancel)->getArgOperand(1), GTID); @@ -499,7 +492,7 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelIfCond) { Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); EXPECT_EQ(M->size(), 4U); - EXPECT_EQ(F->size(), 7U); + EXPECT_EQ(F->size(), 10U); EXPECT_EQ(BB->size(), 1U); ASSERT_TRUE(isa(BB->getTerminator())); ASSERT_EQ(BB->getTerminator()->getNumSuccessors(), 2U); @@ -525,23 +518,15 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelIfCond) { EXPECT_EQ(CancelBBTI->getSuccessor(0)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(0)->getUniqueSuccessor(), NewIP.getBlock()); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 3U); - CallInst *GTID1 = dyn_cast(&CancelBBTI->getSuccessor(1)->front()); - EXPECT_NE(GTID1, nullptr); - EXPECT_EQ(GTID1->arg_size(), 1U); - EXPECT_EQ(GTID1->getCalledFunction()->getName(), "__kmpc_global_thread_num"); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotFreeMemory()); - CallInst *Barrier = dyn_cast(GTID1->getNextNode()); - EXPECT_NE(Barrier, nullptr); - EXPECT_EQ(Barrier->arg_size(), 2U); - EXPECT_EQ(Barrier->getCalledFunction()->getName(), "__kmpc_cancel_barrier"); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotFreeMemory()); - EXPECT_TRUE(Barrier->use_empty()); + EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB); + EXPECT_EQ(CancelBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), + CBB); EXPECT_EQ(cast(Cancel)->getArgOperand(1), GTID); @@ -573,7 +558,7 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelBarrier) { Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); EXPECT_EQ(M->size(), 3U); - EXPECT_EQ(F->size(), 4U); + EXPECT_EQ(F->size(), 5U); EXPECT_EQ(BB->size(), 4U); CallInst *GTID = dyn_cast(&BB->front()); @@ -596,7 +581,11 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelBarrier) { EXPECT_EQ(BarrierBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(BarrierBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(BarrierBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), + EXPECT_EQ(BarrierBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), CBB); EXPECT_EQ(cast(Barrier)->getArgOperand(1), GTID); @@ -1323,8 +1312,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 0U); - EXPECT_EQ(NumFinalizationPoints, 2U); - EXPECT_TRUE(FakeDestructor->hasNUses(2)); + EXPECT_EQ(NumFinalizationPoints, 1U); + EXPECT_TRUE(FakeDestructor->hasNUses(1)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -2961,7 +2950,8 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { BranchInst *EntryBr = 
cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -2973,7 +2963,7 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { EXPECT_TRUE(isa(MasterEntryCI->getArgOperand(0))); CallInst *MasterEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { MasterEndCI = cast(cur); @@ -3044,7 +3034,8 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3056,7 +3047,7 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { EXPECT_TRUE(isa(MaskedEntryCI->getArgOperand(0))); CallInst *MaskedEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { MaskedEndCI = cast(cur); @@ -3109,6 +3100,9 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { FINICB_WRAPPER(FiniCB), "testCRT", nullptr)); Builder.restoreIP(AfterIP); + BasicBlock *FinalizeBB = EntryBB->getUniqueSuccessor(); + EXPECT_NE(FinalizeBB, nullptr); + CallInst *CriticalEntryCI = nullptr; for (auto &EI : *EntryBB) { Instruction *cur = &EI; @@ -3125,7 +3119,7 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { EXPECT_TRUE(isa(CriticalEntryCI->getArgOperand(0))); CallInst *CriticalEndCI = nullptr; - for (auto &FI : *EntryBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { CriticalEndCI = cast(cur); @@ -3360,6 +3354,9 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { FINICB_WRAPPER(FiniCB), true)); Builder.restoreIP(AfterIP); + BasicBlock *FinalizeBB = EntryBB->getUniqueSuccessor(); + EXPECT_NE(FinalizeBB, nullptr); + Builder.CreateRetVoid(); OMPBuilder.finalize(); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -3382,7 +3379,7 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { EXPECT_TRUE(isa(OrderedEntryCI->getArgOperand(0))); CallInst *OrderedEndCI = nullptr; - for (auto &FI : *EntryBB) { + for (auto &FI : *FinalizeBB) { Instruction *Cur = &FI; if (isa(Cur)) { OrderedEndCI = cast(Cur); @@ -3558,7 +3555,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3570,7 +3568,7 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { EXPECT_TRUE(isa(SingleEntryCI->getArgOperand(0))); CallInst *SingleEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { SingleEndCI = cast(cur); @@ -3652,7 +3650,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); 
EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3664,7 +3663,7 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { EXPECT_TRUE(isa(SingleEntryCI->getArgOperand(0))); CallInst *SingleEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { SingleEndCI = cast(cur); @@ -3776,7 +3775,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3795,25 +3795,28 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { EXPECT_EQ(PrivLI->getPointerOperand(), PrivAI); // icmp EXPECT_TRUE(ThenBBI.next()); + + // check FinalizeBB + BBInstIter FinalizeBBI(FinalizeBB); // store 1, DidIt - auto *DidItSI = ThenBBI.next(); + auto *DidItSI = FinalizeBBI.next(); EXPECT_NE(DidItSI, nullptr); EXPECT_EQ(DidItSI->getValueOperand(), ConstantInt::get(Type::getInt32Ty(Ctx), 1)); Value *DidIt = DidItSI->getPointerOperand(); // call __kmpc_end_single - auto *SingleEndCI = ThenBBI.next(); + auto *SingleEndCI = FinalizeBBI.next(); EXPECT_NE(SingleEndCI, nullptr); EXPECT_EQ(SingleEndCI->getCalledFunction()->getName(), "__kmpc_end_single"); EXPECT_EQ(SingleEndCI->arg_size(), 2U); EXPECT_TRUE(isa(SingleEndCI->getArgOperand(0))); EXPECT_EQ(SingleEndCI->getArgOperand(1), SingleEntryCI->getArgOperand(1)); // br ExitBB - auto *ExitBBBI = ThenBBI.next(); + auto *ExitBBBI = FinalizeBBI.next(); EXPECT_NE(ExitBBBI, nullptr); EXPECT_TRUE(ExitBBBI->isUnconditional()); EXPECT_EQ(ExitBBBI->getOperand(0), ExitBB); - EXPECT_FALSE(ThenBBI.hasNext()); + EXPECT_FALSE(FinalizeBBI.hasNext()); // check ExitBB BBInstIter ExitBBI(ExitBB); diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 53d581c8db7c9..13712a76d3edf 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -449,6 +449,9 @@ TEST_F(ConstantRangeTest, Trunc) { // trunc([7, 1), 3->2) = [3, 1) ConstantRange SevenOne(APInt(3, 7), APInt(3, 1)); EXPECT_EQ(SevenOne.truncate(2), ConstantRange(APInt(2, 3), APInt(2, 1))); + + ConstantRange Nop = Full.truncate(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, TruncNuw) { @@ -527,6 +530,9 @@ TEST_F(ConstantRangeTest, ZExt) { // zext([5, 0), 3->7) = [5, 8) ConstantRange FiveZero(APInt(3, 5), APInt(3, 0)); EXPECT_EQ(FiveZero.zeroExtend(7), ConstantRange(APInt(7, 5), APInt(7, 8))); + + ConstantRange Nop = Full.zeroExtend(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, SExt) { @@ -550,6 +556,9 @@ TEST_F(ConstantRangeTest, SExt) { EXPECT_EQ(ConstantRange(APInt(16, 0x0200), APInt(16, 0x8000)).signExtend(19), ConstantRange(APInt(19, 0x0200), APInt(19, 0x8000))); + + ConstantRange Nop = Full.signExtend(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, IntersectWith) { diff --git a/llvm/utils/lit/lit/run.py 
b/llvm/utils/lit/lit/run.py
index 3fc4a1b9b40bd..9c54511bfd625 100644
--- a/llvm/utils/lit/lit/run.py
+++ b/llvm/utils/lit/lit/run.py
@@ -7,6 +7,14 @@
 import lit.util
 import lit.worker
 
+# Windows has a limit of 60 workers per pool.
+# This is defined in the multiprocessing module implementation.
+# See: https://github.com/python/cpython/blob/6bc65c30ff1fd0b581a2c93416496fc720bc442c/Lib/concurrent/futures/process.py#L669-L672
+WINDOWS_MAX_WORKERS_PER_POOL = 60
+
+
+def _ceilDiv(a, b):
+    return (a + b - 1) // b
 
 
 class MaxFailuresError(Exception):
     pass
@@ -72,25 +80,65 @@ def _execute(self, deadline):
             if v is not None
         }
 
-        pool = multiprocessing.Pool(
-            self.workers, lit.worker.initialize, (self.lit_config, semaphores)
+        # Windows has a limit of 60 workers per pool, so multiple pools are
+        # needed when more workers than that are requested. The limit can be
+        # overridden with the LIT_WINDOWS_MAX_WORKERS_PER_POOL environment
+        # variable.
+        max_workers_per_pool = (
+            WINDOWS_MAX_WORKERS_PER_POOL if os.name == "nt" else self.workers
         )
+        max_workers_per_pool = int(
+            os.getenv("LIT_WINDOWS_MAX_WORKERS_PER_POOL", max_workers_per_pool)
+        )
-        async_results = [
-            pool.apply_async(
-                lit.worker.execute, args=[test], callback=self.progress_callback
+        num_pools = max(1, _ceilDiv(self.workers, max_workers_per_pool))
+
+        # Distribute self.workers across num_pools as evenly as possible
+        workers_per_pool_list = [self.workers // num_pools] * num_pools
+        for pool_idx in range(self.workers % num_pools):
+            workers_per_pool_list[pool_idx] += 1
+
+        if num_pools > 1:
+            self.lit_config.note(
+                "Using %d pools balancing %d workers total distributed as %s (Windows worker limit workaround)"
+                % (num_pools, self.workers, workers_per_pool_list)
             )
-            for test in self.tests
-        ]
-        pool.close()
+
+        # Create multiple pools
+        pools = []
+        for pool_size in workers_per_pool_list:
+            pool = multiprocessing.Pool(
+                pool_size, lit.worker.initialize, (self.lit_config, semaphores)
+            )
+            pools.append(pool)
+
+        # Distribute tests across pools
+        tests_per_pool = _ceilDiv(len(self.tests), num_pools)
+        async_results = []
+
+        for pool_idx, pool in enumerate(pools):
+            start_idx = pool_idx * tests_per_pool
+            end_idx = min(start_idx + tests_per_pool, len(self.tests))
+            for test in self.tests[start_idx:end_idx]:
+                ar = pool.apply_async(
+                    lit.worker.execute, args=[test], callback=self.progress_callback
+                )
+                async_results.append(ar)
+
+        # Close all pools
+        for pool in pools:
+            pool.close()
 
         try:
             self._wait_for(async_results, deadline)
         except:
-            pool.terminate()
+            # Terminate all pools on exception
+            for pool in pools:
+                pool.terminate()
             raise
         finally:
-            pool.join()
+            # Join all pools
+            for pool in pools:
+                pool.join()
 
     def _wait_for(self, async_results, deadline):
         timeout = deadline - time.time()
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index e4e031b3e0898..6f25fbc94b757 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -114,11 +114,6 @@ def usable_core_count():
     except AttributeError:
         n = os.cpu_count() or 1
 
-    # On Windows with more than 60 processes, multiprocessing's call to
-    # _winapi.WaitForMultipleObjects() prints an error and lit hangs.
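# Illustrative sketch, not part of the patch: the pool-splitting arithmetic
# added to run.py above, pulled out as a standalone helper. The name
# `distribute` is hypothetical; only the ceil division and the remainder
# spreading mirror the patched code.
#
#   def distribute(workers, max_per_pool):
#       pools = max(1, (workers + max_per_pool - 1) // max_per_pool)
#       sizes = [workers // pools] * pools
#       for i in range(workers % pools):  # earlier pools absorb the remainder
#           sizes[i] += 1
#       return sizes
#
#   distribute(17, 5) == [5, 4, 4, 4] and distribute(20, 15) == [10, 10],
#   matching the expectations in llvm/utils/lit/tests/windows-pools.py.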
- if platform.system() == "Windows": - return min(n, 60) - return n def abs_path_preserve_drive(path): diff --git a/llvm/utils/lit/tests/windows-pools.py b/llvm/utils/lit/tests/windows-pools.py new file mode 100644 index 0000000000000..85110b37c2601 --- /dev/null +++ b/llvm/utils/lit/tests/windows-pools.py @@ -0,0 +1,27 @@ +# Create a directory with 20 files and check the number of pools and workers per pool that lit will use. + +# RUN: rm -Rf %t.dir && mkdir -p %t.dir +# RUN: python -c "for i in range(20): open(rf'%t.dir/file{i}.txt', 'w').write('RUN:')" + +# RUN: echo "import lit.formats" > %t.dir/lit.cfg +# RUN: echo "config.name = \"top-level-suite\"" >> %t.dir/lit.cfg +# RUN: echo "config.suffixes = [\".txt\"]" >> %t.dir/lit.cfg +# RUN: echo "config.test_format = lit.formats.ShTest()" >> %t.dir/lit.cfg + + +# 15 workers per pool max, 100 workers total max: we expect lit to cap the workers to the number of files +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=15" %{lit} -s %t.dir/ -j100 > %t.out 2>&1 +# CHECK: Using 2 pools balancing 20 workers total distributed as [10, 10] +# CHECK: Passed: 20 + +# 5 workers per pool max, 17 workers total max +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=5" %{lit} -s %t.dir/ -j17 >> %t.out 2>&1 +# CHECK: Using 4 pools balancing 17 workers total distributed as [5, 4, 4, 4] +# CHECK: Passed: 20 + +# 19 workers per pool max, 19 workers total max +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=19" %{lit} -s %t.dir/ -j19 >> %t.out 2>&1 +# CHECK-NOT: workers total distributed as +# CHECK: Passed: 20 + +# RUN: cat %t.out | FileCheck %s diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp index 81fbdb1611deb..5c68236526b7d 100644 --- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp +++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp @@ -41,15 +41,17 @@ static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable, } /// Helper function to look up or create the symbol for a runtime library -/// function with the given parameter types. Always returns an int64_t. +/// function with the given parameter types. Returns an int64_t, unless a +/// different result type is specified. static FailureOr lookupOrCreateApFloatFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name, TypeRange paramTypes, - SymbolTableCollection *symbolTables = nullptr) { - auto i64Type = IntegerType::get(symTable->getContext(), 64); - + SymbolTableCollection *symbolTables = nullptr, + Type resultType = {}) { + if (!resultType) + resultType = IntegerType::get(symTable->getContext(), 64); std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str(); - auto funcT = FunctionType::get(b.getContext(), paramTypes, {i64Type}); + auto funcT = FunctionType::get(b.getContext(), paramTypes, {resultType}); FailureOr func = lookupFnDecl(symTable, funcName, funcT, symbolTables); // Failed due to type mismatch. @@ -308,6 +310,188 @@ struct IntToFpConversion final : OpRewritePattern { bool isUnsigned; }; +struct CmpFOpToAPFloatConversion final : OpRewritePattern { + CmpFOpToAPFloatConversion(MLIRContext *context, SymbolOpInterface symTable, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), symTable(symTable) {} + + LogicalResult matchAndRewrite(arith::CmpFOp op, + PatternRewriter &rewriter) const override { + // Get APFloat function from runtime library. 
+ auto i1Type = IntegerType::get(symTable->getContext(), 1); + auto i8Type = IntegerType::get(symTable->getContext(), 8); + auto i32Type = IntegerType::get(symTable->getContext(), 32); + auto i64Type = IntegerType::get(symTable->getContext(), 64); + FailureOr fn = + lookupOrCreateApFloatFn(rewriter, symTable, "compare", + {i32Type, i64Type, i64Type}, nullptr, i8Type); + if (failed(fn)) + return fn; + + // Cast operands to 64-bit integers. + rewriter.setInsertionPoint(op); + Location loc = op.getLoc(); + auto floatTy = cast(op.getLhs().getType()); + auto intWType = rewriter.getIntegerType(floatTy.getWidth()); + Value lhsBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs())); + Value rhsBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs())); + + // Call APFloat function. + Value semValue = getSemanticsValue(rewriter, loc, floatTy); + SmallVector params = {semValue, lhsBits, rhsBits}; + Value comparisonResult = + func::CallOp::create(rewriter, loc, TypeRange(i8Type), + SymbolRefAttr::get(*fn), params) + ->getResult(0); + + // Generate an i1 SSA value that is "true" if the comparison result matches + // the given `val`. + auto checkResult = [&](llvm::APFloat::cmpResult val) { + return arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::eq, comparisonResult, + arith::ConstantOp::create( + rewriter, loc, i8Type, + rewriter.getIntegerAttr(i8Type, static_cast(val))) + .getResult()); + }; + // Generate an i1 SSA value that is "true" if the comparison result matches + // any of the given `vals`. + std::function)> checkResults = + [&](ArrayRef vals) { + Value first = checkResult(vals.front()); + if (vals.size() == 1) + return first; + Value rest = checkResults(vals.drop_front()); + return arith::OrIOp::create(rewriter, loc, first, rest).getResult(); + }; + + // This switch-case statement was taken from arith::applyCmpPredicate. + Value result; + switch (op.getPredicate()) { + case arith::CmpFPredicate::AlwaysFalse: + result = arith::ConstantOp::create(rewriter, loc, i1Type, + rewriter.getIntegerAttr(i1Type, 0)) + .getResult(); + break; + case arith::CmpFPredicate::OEQ: + result = checkResult(llvm::APFloat::cmpEqual); + break; + case arith::CmpFPredicate::OGT: + result = checkResult(llvm::APFloat::cmpGreaterThan); + break; + case arith::CmpFPredicate::OGE: + result = checkResults( + {llvm::APFloat::cmpGreaterThan, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::OLT: + result = checkResult(llvm::APFloat::cmpLessThan); + break; + case arith::CmpFPredicate::OLE: + result = + checkResults({llvm::APFloat::cmpLessThan, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::ONE: + // Not cmpUnordered and not cmpUnordered. + result = checkResults( + {llvm::APFloat::cmpLessThan, llvm::APFloat::cmpGreaterThan}); + break; + case arith::CmpFPredicate::ORD: + // Not cmpUnordered. 
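+      // ORD holds when neither operand is NaN, i.e. the runtime comparison
+      // returned one of the three ordered results below.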
+ result = checkResults({llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UEQ: + result = + checkResults({llvm::APFloat::cmpUnordered, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UGT: + result = checkResults( + {llvm::APFloat::cmpUnordered, llvm::APFloat::cmpGreaterThan}); + break; + case arith::CmpFPredicate::UGE: + result = checkResults({llvm::APFloat::cmpUnordered, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::ULT: + result = checkResults( + {llvm::APFloat::cmpUnordered, llvm::APFloat::cmpLessThan}); + break; + case arith::CmpFPredicate::ULE: + result = + checkResults({llvm::APFloat::cmpUnordered, llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UNE: + // Not cmpEqual. + result = checkResults({llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpUnordered}); + break; + case arith::CmpFPredicate::UNO: + result = checkResult(llvm::APFloat::cmpUnordered); + break; + case arith::CmpFPredicate::AlwaysTrue: + result = arith::ConstantOp::create(rewriter, loc, i1Type, + rewriter.getIntegerAttr(i1Type, 1)) + .getResult(); + break; + } + rewriter.replaceOp(op, result); + return success(); + } + + SymbolOpInterface symTable; +}; + +struct NegFOpToAPFloatConversion final : OpRewritePattern { + NegFOpToAPFloatConversion(MLIRContext *context, SymbolOpInterface symTable, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), symTable(symTable) {} + + LogicalResult matchAndRewrite(arith::NegFOp op, + PatternRewriter &rewriter) const override { + // Get APFloat function from runtime library. + auto i32Type = IntegerType::get(symTable->getContext(), 32); + auto i64Type = IntegerType::get(symTable->getContext(), 64); + FailureOr fn = + lookupOrCreateApFloatFn(rewriter, symTable, "neg", {i32Type, i64Type}); + if (failed(fn)) + return fn; + + // Cast operand to 64-bit integer. + rewriter.setInsertionPoint(op); + Location loc = op.getLoc(); + auto floatTy = cast(op.getOperand().getType()); + auto intWType = rewriter.getIntegerType(floatTy.getWidth()); + Value operandBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, arith::BitcastOp::create(rewriter, loc, intWType, op.getOperand())); + + // Call APFloat function. + Value semValue = getSemanticsValue(rewriter, loc, floatTy); + SmallVector params = {semValue, operandBits}; + Value negatedBits = + func::CallOp::create(rewriter, loc, TypeRange(i64Type), + SymbolRefAttr::get(*fn), params) + ->getResult(0); + + // Truncate result to the original width. 
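+    // The runtime call carries the result bits in the low bits of an i64;
+    // truncating to the storage width and bitcasting recovers the float.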
+ Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType, + negatedBits); + Value result = + arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits); + rewriter.replaceOp(op, result); + return success(); + } + + SymbolOpInterface symTable; +}; + namespace { struct ArithToAPFloatConversionPass final : impl::ArithToAPFloatConversionPassBase { @@ -329,8 +513,17 @@ void ArithToAPFloatConversionPass::runOnOperation() { context, "divide", getOperation()); patterns.add>( context, "remainder", getOperation()); + patterns.add>( + context, "minnum", getOperation()); + patterns.add>( + context, "maxnum", getOperation()); + patterns.add>( + context, "minimum", getOperation()); + patterns.add>( + context, "maximum", getOperation()); patterns - .add, FpToFpConversion>( + .add, FpToFpConversion, + CmpFOpToAPFloatConversion, NegFOpToAPFloatConversion>( context, getOperation()); patterns.add>(context, getOperation(), /*isUnsigned=*/false); diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index d3c305555fde8..b98f15cfe6d75 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -4707,16 +4707,20 @@ LogicalResult NVVMTargetAttr::verifyTarget(Operation *gpuModule) { "Minimum NVVM target SM version is sm_20"); } - gpuModuleOp->walk([&](Operation *op) { - if (auto reqOp = llvm::dyn_cast(op)) { - const NVVMCheckSMVersion requirement = reqOp.getRequiredMinSMVersion(); - if (!requirement.isCompatibleWith(targetSMVersion)) { - op->emitOpError() << "is not supported on " << getChip(); - return WalkResult::interrupt(); - } - } - return WalkResult::advance(); - }); + if (gpuModuleOp + ->walk([&](Operation *op) { + if (auto reqOp = llvm::dyn_cast(op)) { + const NVVMCheckSMVersion requirement = + reqOp.getRequiredMinSMVersion(); + if (!requirement.isCompatibleWith(targetSMVersion)) { + op->emitOpError() << "is not supported on " << getChip(); + return WalkResult::interrupt(); + } + } + return WalkResult::advance(); + }) + .wasInterrupted()) + return failure(); return success(); } diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index e85a2ab26bd32..01e6e1e248658 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -430,19 +430,33 @@ static bool convLayoutMatches(ArrayRef> mapListExpected, }))); } -/// Enum of all kinds of Pooling Op's type. -enum PoolingType { - NONE, - MAX_SIGNED, - MAX_UNSIGNED, - MIN_SIGNED, - MIN_UNSIGNED, - SUM +/// Enum representing pooling operation types used by ConvMatcherBuilder. +enum class PoolingType { + None, + MaxSigned, + MaxUnsigned, + MinSigned, + MinUnsigned, + Sum }; /// Helper class for building convolution op matchers with minimal boilerplate. /// Reduces repetitive code across Conv1D/2D/3D and Depthwise variants as well /// as Pooling ops. +/// +/// Usage: Create an instance with the op, spatial rank, and output pointers for +/// extracted dilations/strides. Then chain matchStride() calls for each spatial +/// dimension, followed by matchMaps() to verify indexing maps, and finally +/// matchBody() to verify the operation body pattern. +/// +/// The `matched` flag starts as `true` and is set to `false` if any match step +/// fails. This allows chaining multiple match calls; once any match fails, all +/// subsequent calls become no-ops and the final result is `false`. 
+/// +/// The `dilations` and `strides` pointers are output parameters that get +/// populated with the extracted dilation and stride values from the operation's +/// indexing maps during matchStride() calls. These values are initially set to +/// 1 for each spatial dimension and updated as patterns are matched. class ConvMatcherBuilder { LinalgOp op; MLIRContext *ctx; @@ -454,7 +468,7 @@ class ConvMatcherBuilder { public: ConvMatcherBuilder(LinalgOp op, unsigned spatialRank, SmallVector *d, SmallVector *s, - PoolingType poolingType = PoolingType::NONE) + PoolingType poolingType = PoolingType::None) : op(op), ctx(op->getContext()), dilations(d), strides(s), indexingMaps(op.getIndexingMaps()), poolingType(poolingType) { *dilations = SmallVector(spatialRank, 1); @@ -474,16 +488,16 @@ class ConvMatcherBuilder { ConvMatcherBuilder &matchStride(unsigned iDim, unsigned fDim, unsigned oDim, unsigned idx) { if (matched) { - matched = matchConvDimAddExprPattern(indexingMaps, iDim, fDim, oDim, - (*dilations)[idx], (*strides)[idx]); + matched &= matchConvDimAddExprPattern(indexingMaps, iDim, fDim, oDim, + (*dilations)[idx], (*strides)[idx]); } return *this; } /// Match expected indexing maps layout. Returns *this for method chaining. - ConvMatcherBuilder &expectMaps(ArrayRef> maps) { + ConvMatcherBuilder &matchMaps(ArrayRef> maps) { if (matched) - matched = convLayoutMatches(maps, indexingMaps, ctx); + matched &= convLayoutMatches(maps, indexingMaps, ctx); return *this; } @@ -494,17 +508,17 @@ class ConvMatcherBuilder { Block *body = op.getBlock(); auto yieldOp = cast(body->getTerminator()); switch (poolingType) { - case PoolingType::NONE: + case PoolingType::None: return bodyMatcherForConvolutionOps(yieldOp.getOperand(0), body); - case PoolingType::MAX_SIGNED: + case PoolingType::MaxSigned: return bodyMatcherForMaxSignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MAX_UNSIGNED: + case PoolingType::MaxUnsigned: return bodyMatcherForMaxUnsignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MIN_SIGNED: + case PoolingType::MinSigned: return bodyMatcherForMinSignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MIN_UNSIGNED: + case PoolingType::MinUnsigned: return bodyMatcherForMinUnsignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::SUM: + case PoolingType::Sum: return bodyMatcherForSumPoolOps(yieldOp.getOperand(0), body); } return false; @@ -533,9 +547,9 @@ bool isaConvolutionOpOfType(LinalgOp op, AffineExpr w = m.dim(1); return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) - .expectMaps({/*inputMap=*/{m.strided(W, w, 0)}, - /*filterMap=*/{w}, - /*outputMap=*/{W}}) + .matchMaps({/*inputMap=*/{m.strided(W, w, 0)}, + /*filterMap=*/{w}, + /*outputMap=*/{W}}) .matchBody(); } @@ -560,9 +574,9 @@ bool isaConvolutionOpOfType( AffineExpr c = m.dim(4); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), c}, - /*filterMap=*/{w, c, F}, - /*outputMap=*/{N, W, F}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), c}, + /*filterMap=*/{w, c, F}, + /*outputMap=*/{N, W, F}}) .matchBody(); } @@ -587,9 +601,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(4); return m.matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, c, m.strided(W, w, 0)}, - /*filterMap=*/{F, c, w}, - /*outputMap=*/{N, F, W}}) + .matchMaps({/*inputMap=*/{N, c, m.strided(W, w, 0)}, + /*filterMap=*/{F, c, w}, + /*outputMap=*/{N, F, W}}) .matchBody(); } @@ -614,9 +628,9 @@ bool 
isaConvolutionOpOfType(LinalgOp op, return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) .matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/1) - .expectMaps({/*inputMap=*/{m.strided(H, h, 0), m.strided(W, w, 1)}, - /*filterMap=*/{h, w}, - /*outputMap=*/{H, W}}) + .matchMaps({/*inputMap=*/{m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{h, w}, + /*outputMap=*/{H, W}}) .matchBody(); } @@ -644,10 +658,10 @@ bool isaConvolutionOpOfType(LinalgOp op, return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) .matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/1) .matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/2) - .expectMaps({/*inputMap=*/{m.strided(D, d, 0), m.strided(H, h, 1), - m.strided(W, w, 2)}, - /*filterMap=*/{d, h, w}, - /*outputMap=*/{D, H, W}}) + .matchMaps({/*inputMap=*/{m.strided(D, d, 0), m.strided(H, h, 1), + m.strided(W, w, 2)}, + /*filterMap=*/{d, h, w}, + /*outputMap=*/{D, H, W}}) .matchBody(); } @@ -671,9 +685,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(3); return m.matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, C, m.strided(W, w, 0)}, - /*filterMap=*/{C, w}, - /*outputMap=*/{N, C, W}}) + .matchMaps({/*inputMap=*/{N, C, m.strided(W, w, 0)}, + /*filterMap=*/{C, w}, + /*outputMap=*/{N, C, W}}) .matchBody(); } @@ -697,9 +711,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(3); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, - /*filterMap=*/{w, C}, - /*outputMap=*/{N, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, + /*filterMap=*/{w, C}, + /*outputMap=*/{N, W, C}}) .matchBody(); } @@ -724,9 +738,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(4); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, - /*filterMap=*/{w, C, CM}, - /*outputMap=*/{N, W, C, CM}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, + /*filterMap=*/{w, C, CM}, + /*outputMap=*/{N, W, C, CM}}) .matchBody(); } @@ -753,9 +767,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/0) .matchStride(/*iDim=*/3, /*fDim=*/2, /*oDim=*/3, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, C, m.strided(H, h, 0), m.strided(W, w, 1)}, - /*filterMap=*/{C, h, w}, - /*outputMap=*/{N, C, H, W}}) + .matchMaps({/*inputMap=*/{N, C, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{C, h, w}, + /*outputMap=*/{N, C, H, W}}) .matchBody(); } @@ -789,10 +803,10 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) .matchStride(/*iDim=*/3, /*fDim=*/2, /*oDim=*/3, /*idx=*/2) - .expectMaps({/*inputMap=*/{N, m.strided(D, d, 0), m.strided(H, h, 1), - m.strided(W, w, 2), C}, - /*filterMap=*/{d, h, w, C, CM}, - /*outputMap=*/{N, D, H, W, C, CM}}) + .matchMaps({/*inputMap=*/{N, m.strided(D, d, 0), m.strided(H, h, 1), + m.strided(W, w, 2), C}, + /*filterMap=*/{d, h, w, C, CM}, + /*outputMap=*/{N, D, H, W, C, CM}}) .matchBody(); } @@ -810,7 +824,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MAX_SIGNED); + PoolingType::MaxSigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -820,9 +834,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, 
/*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -840,7 +854,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MIN_SIGNED); + PoolingType::MinSigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -850,9 +864,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -870,7 +884,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::SUM); + PoolingType::Sum); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -880,9 +894,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -900,7 +914,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MAX_UNSIGNED); + PoolingType::MaxUnsigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -910,9 +924,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -930,7 +944,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MIN_UNSIGNED); + PoolingType::MinUnsigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -940,9 +954,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } diff --git a/mlir/lib/Dialect/SCF/IR/CMakeLists.txt b/mlir/lib/Dialect/SCF/IR/CMakeLists.txt index 423e1c3e1e042..b111117410ba3 100644 --- a/mlir/lib/Dialect/SCF/IR/CMakeLists.txt +++ 
b/mlir/lib/Dialect/SCF/IR/CMakeLists.txt @@ -19,5 +19,5 @@ add_mlir_dialect_library(MLIRSCFDialect MLIRSideEffectInterfaces MLIRTensorDialect MLIRValueBoundsOpInterface + MLIRTransformUtils ) - diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index a09b9639f8dd1..a63e44f4808e0 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -26,6 +26,7 @@ #include "mlir/Interfaces/ParallelCombiningOpInterface.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "mlir/Transforms/InliningUtils.h" +#include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -3682,6 +3683,133 @@ LogicalResult scf::WhileOp::verify() { } namespace { +/// Move an scf.if op that is directly before the scf.condition op in the while +/// before region, and whose condition matches the condition of the +/// scf.condition op, down into the while after region. +/// +/// scf.while (..) : (...) -> ... { +/// %additional_used_values = ... +/// %cond = ... +/// ... +/// %res = scf.if %cond -> (...) { +/// use(%additional_used_values) +/// ... // then block +/// scf.yield %then_value +/// } else { +/// scf.yield %else_value +/// } +/// scf.condition(%cond) %res, ... +/// } do { +/// ^bb0(%res_arg, ...): +/// use(%res_arg) +/// ... +/// +/// becomes +/// scf.while (..) : (...) -> ... { +/// %additional_used_values = ... +/// %cond = ... +/// ... +/// scf.condition(%cond) %else_value, ..., %additional_used_values +/// } do { +/// ^bb0(%res_arg, ..., %additional_args): +/// use(%additional_args) +/// ... // if then block +/// use(%then_value) +/// ... +struct WhileMoveIfDown : public OpRewritePattern<scf::WhileOp> { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(scf::WhileOp op, + PatternRewriter &rewriter) const override { + auto conditionOp = op.getConditionOp(); + + // Only support an ifOp right before the condition at the moment. Relaxing + // this would require us to: + // - check that the body does not have side-effects conflicting with + // operations between the if and the condition. + // - check that results of the if operation are only used as arguments to + // the condition. + auto ifOp = dyn_cast_or_null<scf::IfOp>(conditionOp->getPrevNode()); + + // Check that the ifOp is directly before the conditionOp and that it + // matches the condition of the conditionOp. Also ensure that the ifOp has + // no else block with content, as that would complicate the transformation. + // TODO: support else blocks with content. + if (!ifOp || ifOp.getCondition() != conditionOp.getCondition() || + (ifOp.elseBlock() && !ifOp.elseBlock()->without_terminator().empty())) + return failure(); + + assert((ifOp->use_empty() || (llvm::all_equal(ifOp->getUsers()) && + *ifOp->user_begin() == conditionOp)) && + "ifOp has unexpected uses"); + + Location loc = op.getLoc(); + + // Replace uses of ifOp results in the conditionOp with the yielded values + // from the ifOp branches. + for (auto [idx, arg] : llvm::enumerate(conditionOp.getArgs())) { + auto it = llvm::find(ifOp->getResults(), arg); + if (it != ifOp->getResults().end()) { + size_t ifOpIdx = it.getIndex(); + Value thenValue = ifOp.thenYield()->getOperand(ifOpIdx); + Value elseValue = ifOp.elseYield()->getOperand(ifOpIdx); + + rewriter.replaceAllUsesWith(ifOp->getResults()[ifOpIdx], elseValue); + rewriter.replaceAllUsesWith(op.getAfterArguments()[idx], thenValue); + } + } + + // Collect additional used values from before region.
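+ // These are values defined in the before region and used inside the then + // block; once that block is inlined into the after region they would no + // longer be in scope, so they must be forwarded through scf.condition as + // extra arguments. Values defined in enclosing regions stay visible from + // the after region and need no forwarding.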
+ SetVector<Value> additionalUsedValuesSet; + visitUsedValuesDefinedAbove(ifOp.getThenRegion(), [&](OpOperand *operand) { + if (&op.getBefore() == operand->get().getParentRegion()) + additionalUsedValuesSet.insert(operand->get()); + }); + + // Create new whileOp with additional used values as results. + auto additionalUsedValues = additionalUsedValuesSet.getArrayRef(); + auto additionalValueTypes = llvm::map_to_vector( + additionalUsedValues, [](Value val) { return val.getType(); }); + size_t additionalValueSize = additionalUsedValues.size(); + SmallVector<Type> newResultTypes(op.getResultTypes()); + newResultTypes.append(additionalValueTypes); + + auto newWhileOp = + scf::WhileOp::create(rewriter, loc, newResultTypes, op.getInits()); + + rewriter.modifyOpInPlace(newWhileOp, [&] { + newWhileOp.getBefore().takeBody(op.getBefore()); + newWhileOp.getAfter().takeBody(op.getAfter()); + newWhileOp.getAfter().addArguments( + additionalValueTypes, + SmallVector<Location>(additionalValueSize, loc)); + }); + + rewriter.modifyOpInPlace(conditionOp, [&] { + conditionOp.getArgsMutable().append(additionalUsedValues); + }); + + // Replace uses of additional used values inside the ifOp then region with + // the whileOp after region arguments. + rewriter.replaceUsesWithIf( + additionalUsedValues, + newWhileOp.getAfterArguments().take_back(additionalValueSize), + [&](OpOperand &use) { + return ifOp.getThenRegion().isAncestor( + use.getOwner()->getParentRegion()); + }); + + // Inline ifOp then region into new whileOp after region. + rewriter.eraseOp(ifOp.thenYield()); + rewriter.inlineBlockBefore(ifOp.thenBlock(), newWhileOp.getAfterBody(), + newWhileOp.getAfterBody()->begin()); + rewriter.eraseOp(ifOp); + rewriter.replaceOp(op, + newWhileOp->getResults().drop_back(additionalValueSize)); + return success(); + } +}; + /// Replace uses of the condition within the do block with true, since otherwise /// the block would not be evaluated. /// @@ -4394,7 +4522,8 @@ void WhileOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add<RemoveLoopInvariantArgsFromBeforeBlock, RemoveLoopInvariantValueYielded, WhileConditionTruth, WhileCmpCond, WhileUnusedResult, WhileRemoveDuplicatedResults, - WhileRemoveUnusedArgs, WhileOpAlignBeforeArgs>(context); + WhileRemoveUnusedArgs, WhileOpAlignBeforeArgs, WhileMoveIfDown>( + context); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp index 9f242f9e62b8e..ec1044aaa42ac 100644 --- a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp @@ -19,83 +19,6 @@ using namespace mlir; namespace { -/// Move an scf.if op that is directly before the scf.condition op in the while -/// before region, and whose condition matches the condition of the -/// scf.condition op, down into the while after region. -/// -/// scf.while (%init) : (...) -> ... { -/// %cond = ... -/// %res = scf.if %cond -> (...) { -/// use1(%init) -/// %then_val = ... -/// ... // then block -/// scf.yield %then_val -/// } else { -/// scf.yield %init -/// } -/// scf.condition(%cond) %res -/// } do { -/// ^bb0(%arg): -/// use2(%arg) -/// ... -/// -/// becomes -/// scf.while (%init) : (...) -> ... { -/// %cond = ... -/// scf.condition(%cond) %init -/// } do { -/// ^bb0(%arg): : -/// use1(%arg) -/// ... // if then block -/// %then_val = ... -/// use2(%then_val) -/// ...
-struct WhileMoveIfDown : public OpRewritePattern<scf::WhileOp> { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(scf::WhileOp op, - PatternRewriter &rewriter) const override { - // Check that the first opeation produces one result and that result must - // have exactly two uses (these two uses come from the `scf.if` and - // `scf.condition` operations). - Operation &condOp = op.getBeforeBody()->front(); - if (condOp.getNumResults() != 1 || !condOp.getResult(0).hasNUses(2)) - return failure(); - - Value condVal = condOp.getResult(0); - auto ifOp = dyn_cast<scf::IfOp>(condOp.getNextNode()); - if (!ifOp || ifOp.getCondition() != condVal) - return failure(); - - auto term = dyn_cast<scf::ConditionOp>(ifOp->getNextNode()); - if (!term || term.getCondition() != condVal) - return failure(); - - // Check that if results and else yield operands match the scf.condition op - // arguments and while before arguments respectively. - if (!llvm::equal(ifOp->getResults(), term.getArgs()) || - !llvm::equal(ifOp.elseYield()->getOperands(), op.getBeforeArguments())) - return failure(); - - // Update uses and move the if op into the after region. - rewriter.replaceAllUsesWith(op.getAfterArguments(), - ifOp.thenYield()->getOperands()); - rewriter.replaceUsesWithIf(op.getBeforeArguments(), op.getAfterArguments(), - [&](OpOperand &use) { - return ifOp.getThenRegion().isAncestor( - use.getOwner()->getParentRegion()); - }); - rewriter.modifyOpInPlace( - term, [&]() { term.getArgsMutable().assign(op.getBeforeArguments()); }); - - rewriter.eraseOp(ifOp.thenYield()); - rewriter.inlineBlockBefore(ifOp.thenBlock(), op.getAfterBody(), - op.getAfterBody()->begin()); - rewriter.eraseOp(ifOp); - return success(); - } -}; - struct UpliftWhileOp : public OpRewritePattern<scf::WhileOp> { using OpRewritePattern::OpRewritePattern; @@ -344,5 +267,5 @@ FailureOr<scf::ForOp> mlir::scf::upliftWhileToForLoop(RewriterBase &rewriter, } void mlir::scf::populateUpliftWhileToForPatterns(RewritePatternSet &patterns) { - patterns.add<UpliftWhileOp, WhileMoveIfDown>(patterns.getContext()); + patterns.add<UpliftWhileOp>(patterns.getContext()); } diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp index 44980ccd77491..f3e38eb8ffa2d 100644 --- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp +++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp @@ -131,4 +131,44 @@ MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_convert_from_int( llvm::RoundingMode::NearestTiesToEven); return result.bitcastToAPInt().getZExtValue(); } + +MLIR_APFLOAT_WRAPPERS_EXPORT int8_t _mlir_apfloat_compare(int32_t semantics, + uint64_t a, + uint64_t b) { + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( + static_cast<llvm::APFloatBase::Semantics>(semantics)); + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); + llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); + llvm::APFloat y(sem, llvm::APInt(bitWidth, b)); + return static_cast<int8_t>(x.compare(y)); +} + +MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_neg(int32_t semantics, uint64_t a) { + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( + static_cast<llvm::APFloatBase::Semantics>(semantics)); + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); + llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); + x.changeSign(); + return x.bitcastToAPInt().getZExtValue(); +} + +/// Min/max operations.
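+/// Each generated wrapper decodes the fltSemantics from the semantics enum, +/// reconstructs both operands from their raw bit patterns, applies the +/// corresponding llvm::minimum, llvm::maximum, llvm::minnum, or llvm::maxnum +/// helper, and returns the result's bit pattern; e.g. +/// APFLOAT_MIN_MAX_OP(minimum) emits the exported symbol +/// _mlir_apfloat_minimum.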
+#define APFLOAT_MIN_MAX_OP(OP) \ + MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_##OP( \ + int32_t semantics, uint64_t a, uint64_t b) { \ + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( \ + static_cast<llvm::APFloatBase::Semantics>(semantics)); \ + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); \ + llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a)); \ + llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b)); \ + llvm::APFloat result = llvm::OP(lhs, rhs); \ + return result.bitcastToAPInt().getZExtValue(); \ + } + +APFLOAT_MIN_MAX_OP(minimum) +APFLOAT_MIN_MAX_OP(maximum) +APFLOAT_MIN_MAX_OP(minnum) +APFLOAT_MIN_MAX_OP(maxnum) + +#undef APFLOAT_MIN_MAX_OP } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 0cab8f352ada7..a2871b36fd9d8 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2809,6 +2809,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionByref()); assert(isByRef.size() == opInst.getNumReductionVars()); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool isCancellable = constructIsCancellable(opInst); if (failed(checkImplementationStatus(*opInst))) return failure(); @@ -2946,6 +2947,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, opInst.getLoc(), privateVarsInfo))) return llvm::make_error<PreviouslyReportedError>(); + // If we could be performing cancellation, add the cancellation barrier on + // the way out of the outlined region. + if (isCancellable) { + auto IPOrErr = ompBuilder->createBarrier( + llvm::OpenMPIRBuilder::LocationDescription(builder), + llvm::omp::Directive::OMPD_unknown, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + if (!IPOrErr) + return IPOrErr.takeError(); + } + builder.restoreIP(oldIP); return llvm::Error::success(); }; @@ -2959,7 +2972,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, auto pbKind = llvm::omp::OMP_PROC_BIND_default; if (auto bind = opInst.getProcBindKind()) pbKind = getProcBindKind(*bind); - bool isCancellable = constructIsCancellable(opInst); llvm::SmallVector deallocIPs; llvm::OpenMPIRBuilder::InsertPointTy allocIP = diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index 252be796488c5..d08e7ecf326ca 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -2923,9 +2923,6 @@ LogicalResult spirv::Deserializer::structurizeControlFlow() { return failure(); } - // TODO: This loop is non-deterministic. Iteration order may vary between runs - // for the same shader as the key to the map is a pointer. See: - // https://github.com/llvm/llvm-project/issues/128547 while (!blockMergeInfo.empty()) { Block *headerBlock = blockMergeInfo.begin()->first; BlockMergeInfo mergeInfo = blockMergeInfo.begin()->second; diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h index 243e6fd70ae43..6d09d556c4d02 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h @@ -58,7 +58,9 @@ struct DebugLine { }; /// Map from a selection/loop's header block to its merge (and continue) target.
-using BlockMergeInfoMap = DenseMap<Block *, BlockMergeInfo>; +/// Use `MapVector<>` to ensure a deterministic iteration order with a pointer +/// key. +using BlockMergeInfoMap = llvm::MapVector<Block *, BlockMergeInfo>; /// A "deferred struct type" is a struct type with one or more member types not /// known when the Deserializer first encounters the struct. This happens, for diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir index d71d81dddcd4f..950d2cecefa95 100644 --- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir +++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir @@ -198,3 +198,68 @@ func.func @uitofp(%arg0: i32) { %0 = arith.uitofp %arg0 : i32 to f4E2M1FN return } + +// ----- + +// CHECK: func.func private @_mlir_apfloat_compare(i32, i64, i64) -> i8 +// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: %[[cmp:.*]] = call @_mlir_apfloat_compare(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i8 +// CHECK: %[[c3:.*]] = arith.constant 3 : i8 +// CHECK: %[[is_unordered:.*]] = arith.cmpi eq, %[[cmp]], %[[c3]] : i8 +// CHECK: %[[c0:.*]] = arith.constant 0 : i8 +// CHECK: %[[is_lt:.*]] = arith.cmpi eq, %[[cmp]], %[[c0]] : i8 +// CHECK: arith.ori %[[is_unordered]], %[[is_lt]] : i1 +func.func @cmpf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.cmpf "ult", %arg0, %arg1 : f4E2M1FN + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_neg(i32, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_neg(%[[sem]], %{{.*}}) : (i32, i64) -> i64 +func.func @negf(%arg0: f32) { + %0 = arith.negf %arg0 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_minimum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_minimum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @minimumf(%arg0: f32, %arg1: f32) { + %0 = arith.minimumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_maximum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_maximum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @maximumf(%arg0: f32, %arg1: f32) { + %0 = arith.maximumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_minnum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_minnum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @minnumf(%arg0: f32, %arg1: f32) { + %0 = arith.minnumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_maxnum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_maxnum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @maxnumf(%arg0: f32, %arg1: f32) { + %0 = arith.maxnumf %arg0, %arg1 : f32 + return +} diff --git a/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir b/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir index edbe8b8001bba..9c277cf99b9a8 100644 --- a/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir +++ b/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -convert-ub-to-spirv -verify-diagnostics %s | FileCheck %s +// RUN: mlir-opt -split-input-file -convert-ub-to-spirv %s | FileCheck %s module attributes { spirv.target_env = #spirv.target_env< #spirv.vce, #spirv.resource_limits<>> } { @@ -22,15 +22,17 @@
func.func @check_poison() { // ----- -// No successful test because the dialect conversion framework does not convert -// unreachable blocks. - module attributes { spirv.target_env = #spirv.target_env< #spirv.vce, #spirv.resource_limits<>> } { -func.func @check_unrechable() { -// expected-error@+1{{cannot be used in reachable block}} - spirv.Unreachable +// CHECK-LABEL: @check_unrechable +func.func @check_unrechable(%c: i1) { + cf.cond_br %c, ^bb1, ^bb2 +^bb1: +// CHECK: spirv.Unreachable + ub.unreachable +^bb2: + return } } diff --git a/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir b/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir new file mode 100644 index 0000000000000..c2cfa7689978b --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir @@ -0,0 +1,11 @@ +// RUN: not mlir-opt %s 2>&1 | FileCheck %s +// CHECK: 'nvvm.tcgen05.alloc' op is not supported on sm_90 + +module { + gpu.module @mod [#nvvm.target<chip = "sm_90">] { + func.func @tcgen05_alloc(%arg0: !llvm.ptr<7>, %arg1: i32) { + nvvm.tcgen05.alloc %arg0, %arg1 : !llvm.ptr<7>, i32 + return + } + } +} diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 084c3fc065de3..ac590fc0c47b9 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -974,6 +974,56 @@ func.func @replace_if_with_cond3(%arg0 : i1, %arg2: i64) -> (i32, i64) { // ----- +// CHECK-LABEL: @while_move_if_down +func.func @while_move_if_down() -> i32 { + %defined_outside = "test.get_some_value0" () : () -> (i32) + %0 = scf.while () : () -> (i32) { + %used_value = "test.get_some_value1" () : () -> (i32) + %used_by_subregion = "test.get_some_value2" () : () -> (i32) + %else_value = "test.get_some_value3" () : () -> (i32) + %condition = "test.condition"() : () -> i1 + %res = scf.if %condition -> (i32) { + "test.use0" (%defined_outside) : (i32) -> () + "test.use1" (%used_value) : (i32) -> () + test.alloca_scope_region { + "test.use2" (%used_by_subregion) : (i32) -> () + } + %then_value = "test.get_some_value4" () : () -> (i32) + scf.yield %then_value : i32 + } else { + scf.yield %else_value : i32 + } + scf.condition(%condition) %res : i32 + } do { + ^bb0(%res_arg: i32): + "test.use3" (%res_arg) : (i32) -> () + scf.yield + } + return %0 : i32 +} +// CHECK: %[[defined_outside:.*]] = "test.get_some_value0"() : () -> i32 +// CHECK: %[[WHILE_RES:.*]]:3 = scf.while : () -> (i32, i32, i32) { +// CHECK: %[[used_value:.*]] = "test.get_some_value1"() : () -> i32 +// CHECK: %[[used_by_subregion:.*]] = "test.get_some_value2"() : () -> i32 +// CHECK: %[[else_value:.*]] = "test.get_some_value3"() : () -> i32 +// CHECK: %[[condition:.*]] = "test.condition"() : () -> i1 +// CHECK: scf.condition(%[[condition]]) %[[else_value]], %[[used_value]], %[[used_by_subregion]] : i32, i32, i32 +// CHECK: } do { +// CHECK: ^bb0(%[[res_arg:.*]]: i32, %[[used_value_arg:.*]]: i32, %[[used_by_subregion_arg:.*]]: i32): +// CHECK: "test.use0"(%[[defined_outside]]) : (i32) -> () +// CHECK: "test.use1"(%[[used_value_arg]]) : (i32) -> () +// CHECK: test.alloca_scope_region { +// CHECK: "test.use2"(%[[used_by_subregion_arg]]) : (i32) -> () +// CHECK: } +// CHECK: %[[then_value:.*]] = "test.get_some_value4"() : () -> i32 +// CHECK: "test.use3"(%[[then_value]]) : (i32) -> () +// CHECK: scf.yield +// CHECK: } +// CHECK: return %[[WHILE_RES]]#0 : i32 +// CHECK: } + +// ----- + // CHECK-LABEL: @while_cond_true func.func @while_cond_true() -> i1 { %0 = scf.while () : () -> i1 { diff --git
a/mlir/test/Dialect/SCF/uplift-while.mlir b/mlir/test/Dialect/SCF/uplift-while.mlir index 736112824c515..cbe2ce5076ad2 100644 --- a/mlir/test/Dialect/SCF/uplift-while.mlir +++ b/mlir/test/Dialect/SCF/uplift-while.mlir @@ -185,34 +185,3 @@ func.func @uplift_while(%arg0: index, %arg1: index, %arg2: index) -> (i32, f32) // CHECK: %[[T2:.*]] = "test.test2"(%[[ARG2]]) : (f32) -> f32 // CHECK: scf.yield %[[T1]], %[[T2]] : i32, f32 // CHECK: return %[[RES]]#0, %[[RES]]#1 : i32, f32 - -// ----- - -func.func @uplift_while(%low: index, %upper: index, %val : i32) -> i32 { - %c1 = arith.constant 1 : index - %1:2 = scf.while (%iv = %low, %iter = %val) : (index, i32) -> (index, i32) { - %2 = arith.cmpi slt, %iv, %upper : index - %3:2 = scf.if %2 -> (index, i32) { - %4 = "test.test"(%iter) : (i32) -> i32 - %5 = arith.addi %iv, %c1 : index - scf.yield %5, %4 : index, i32 - } else { - scf.yield %iv, %iter : index, i32 - } - scf.condition(%2) %3#0, %3#1 : index, i32 - } do { - ^bb0(%arg0: index, %arg1: i32): - scf.yield %arg0, %arg1 : index, i32 - } - return %1#1 : i32 -} - -// CHECK-LABEL: func.func @uplift_while( -// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index, %[[ARG2:.*]]: i32) -> i32 { -// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index -// CHECK: %[[FOR_0:.*]] = scf.for %[[VAL_0:.*]] = %[[ARG0]] to %[[ARG1]] step %[[CONSTANT_0]] iter_args(%[[VAL_1:.*]] = %[[ARG2]]) -> (i32) { -// CHECK: %[[VAL_2:.*]] = "test.test"(%[[VAL_1]]) : (i32) -> i32 -// CHECK: scf.yield %[[VAL_2]] : i32 -// CHECK: } -// CHECK: return %[[FOR_0]] : i32 -// CHECK: } diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir index 8046610d479a8..7f72dd5931488 100644 --- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir @@ -43,6 +43,18 @@ func.func @entry() { %cvt = arith.truncf %b2 : f32 to f8E4M3FN vector.print %cvt : f8E4M3FN + // CHECK-NEXT: -2.25 + %negated = arith.negf %cvt : f8E4M3FN + vector.print %negated : f8E4M3FN + + // CHECK-NEXT: -2.25 + %min = arith.minimumf %cvt, %negated : f8E4M3FN + vector.print %min : f8E4M3FN + + // CHECK-NEXT: 1 + %cmp1 = arith.cmpf "olt", %cvt, %c1 : f8E4M3FN + vector.print %cmp1 : i1 + // CHECK-NEXT: 1 // Bit pattern: 01, interpreted as signed integer: 1 %cvt_int_signed = arith.fptosi %cvt : f8E4M3FN to i2 diff --git a/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir b/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir index c4b245667a1f3..6585549de7f96 100644 --- a/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir +++ b/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir @@ -29,22 +29,24 @@ llvm.func @test() { // CHECK: %[[VAL_14:.*]] = icmp eq i32 %[[VAL_13]], 0 // CHECK: br i1 %[[VAL_14]], label %[[VAL_15:.*]], label %[[VAL_16:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_11]] -// CHECK: %[[VAL_17:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) -// CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_17]]) -// CHECK: br label %[[VAL_19:.*]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CNCL_BARRIER:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[TID]]) +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_11]] // CHECK: %[[VAL_20:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_21:.*]] = call i32 
@__kmpc_cancel_barrier(ptr @3, i32 %[[VAL_20]]) // CHECK: %[[VAL_22:.*]] = icmp eq i32 %[[VAL_21]], 0 // CHECK: br i1 %[[VAL_22]], label %[[VAL_23:.*]], label %[[VAL_24:.*]] // CHECK: omp.par.region1.split.cncl: ; preds = %[[VAL_15]] -// CHECK: br label %[[VAL_19]] +// CHECK: br label %[[FINI]] // CHECK: omp.par.region1.split.cont: ; preds = %[[VAL_15]] // CHECK: br label %[[VAL_25:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_23]] // CHECK: br label %[[VAL_26:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] -// CHECK: br label %[[VAL_19]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_26]], %[[VAL_24]], %[[VAL_16]] +// CHECK: br label %[[FINI]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-cancel.mlir b/mlir/test/Target/LLVMIR/openmp-cancel.mlir index e1abb15fbb476..59f2c36a2523a 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancel.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancel.mlir @@ -25,16 +25,18 @@ llvm.func @cancel_parallel() { // CHECK: %[[VAL_15:.*]] = icmp eq i32 %[[VAL_14]], 0 // CHECK: br i1 %[[VAL_15]], label %[[VAL_16:.*]], label %[[VAL_17:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_12]] +// CHECK: br label %[[VAL_20:.*]] +// CHECK: .fini: // CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_19:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_18]]) -// CHECK: br label %[[VAL_20:.*]] +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_12]] // CHECK: br label %[[VAL_21:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_16]] // CHECK: br label %[[VAL_22:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_21]] // CHECK: br label %[[VAL_20]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_22]], %[[VAL_17]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancel_parallel_if(%arg0 : i1) { @@ -59,27 +61,36 @@ llvm.func @cancel_parallel_if(%arg0 : i1) { // CHECK: omp.par.region: ; preds = %[[VAL_17]] // CHECK: br label %[[VAL_20:.*]] // CHECK: omp.par.region1: ; preds = %[[VAL_19]] -// CHECK: br i1 %[[VAL_16]], label %[[VAL_21:.*]], label %[[VAL_22:.*]] +// CHECK: br i1 %[[VAL_16]], label %[[SPLIT:.*]], label %[[VAL_22:.*]] // CHECK: 3: ; preds = %[[VAL_20]] -// CHECK: br label %[[VAL_23:.*]] -// CHECK: 4: ; preds = %[[VAL_22]], %[[VAL_24:.*]] +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[NOT_CANCELLED:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 1) +// CHECK: %[[COND:.*]] = icmp eq i32 %[[NOT_CANCELLED]], 0 +// CHECK: br i1 %[[COND]], label %[[VAL_23:.*]], label %[[CNCL:.*]] +// CHECK: .cncl: +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_33:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_32]]) +// CHECK: br label %[[EXIT_STUB:.*]] +// CHECK: .split: +// CHECK: br label %[[SEVEN:.*]] +// CHECK: 7: // CHECK: br label %[[VAL_25:.*]] -// CHECK: omp.region.cont: ; preds = %[[VAL_23]] +// CHECK: omp.region.cont: // CHECK: br label %[[VAL_26:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] // CHECK: br label %[[VAL_27:.*]] -// CHECK: 5: ; preds = %[[VAL_20]] +// CHECK: 8: ; preds = %[[VAL_20]] // CHECK: %[[VAL_28:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_29:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_28]], i32 1) // CHECK: %[[VAL_30:.*]] = icmp eq i32 %[[VAL_29]], 0 -// CHECK: br i1 %[[VAL_30]], label %[[VAL_24]], label 
%[[VAL_31:.*]] -// CHECK: .cncl: ; preds = %[[VAL_21]] -// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) -// CHECK: %[[VAL_33:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_32]]) -// CHECK: br label %[[VAL_27]] -// CHECK: .split: ; preds = %[[VAL_21]] -// CHECK: br label %[[VAL_23]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_31]], %[[VAL_26]] +// CHECK: br i1 %[[VAL_30]], label %[[SPLIT5:.*]], label %[[VAL_31:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI]] +// CHECK: .split{{.*}}: +// CHECK: br label %[[SEVEN]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancel_sections_if(%cond : i1) { @@ -133,11 +144,16 @@ llvm.func @cancel_sections_if(%cond : i1) { // CHECK: %[[VAL_30:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_29]], i32 3) // CHECK: %[[VAL_31:.*]] = icmp eq i32 %[[VAL_30]], 0 // CHECK: br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_33:.*]] -// CHECK: .split: ; preds = %[[VAL_27]] +// CHECK: .split{{.*}}: ; preds = %[[VAL_27]] // CHECK: br label %[[VAL_34:.*]] -// CHECK: 11: ; preds = %[[VAL_25]] +// CHECK: 12: ; preds = %[[VAL_25]] +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 3) +// CHECK: %[[COND:.*]] = icmp eq i32 %13, 0 +// CHECK: br i1 %[[COND]], label %[[SPLIT:.*]], label %[[CNCL:.*]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_34]] -// CHECK: 12: ; preds = %[[VAL_28]], %[[VAL_32]] +// CHECK: 15: // CHECK: br label %[[VAL_35:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_34]] // CHECK: br label %[[VAL_23]] @@ -146,17 +162,17 @@ llvm.func @cancel_sections_if(%cond : i1) { // CHECK: omp_section_loop.inc: ; preds = %[[VAL_23]] // CHECK: %[[VAL_15]] = add nuw i32 %[[VAL_14]], 1 // CHECK: br label %[[VAL_12]] -// CHECK: omp_section_loop.exit: ; preds = %[[VAL_33]], %[[VAL_16]] +// CHECK: omp_section_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_7]]) // CHECK: %[[VAL_36:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) // CHECK: br label %[[VAL_37:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_19]] -// CHECK: br label %[[VAL_38:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] // CHECK: ret void -// CHECK: .cncl: ; preds = %[[VAL_27]] -// CHECK: br label %[[VAL_19]] +// CHECK: .cncl: +// CHECK: br label %[[OMP_SECTION_LOOP_EXIT:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[OMP_SECTION_LOOP_EXIT:.*]] llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { omp.wsloop { @@ -222,18 +238,23 @@ llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { // CHECK: %[[VAL_47:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_46]], i32 2) // CHECK: %[[VAL_48:.*]] = icmp eq i32 %[[VAL_47]], 0 // CHECK: br i1 %[[VAL_48]], label %[[VAL_49:.*]], label %[[VAL_50:.*]] -// CHECK: .split: ; preds = %[[VAL_44]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_51:.*]] -// CHECK: 28: ; preds = %[[VAL_42]] +// CHECK: 28: +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 2) +// CHECK: %[[COND:.*]] = icmp eq i32 %[[CANCEL_POINT]], 0 +// CHECK: br i1 %[[COND]], label %[[SPLIT3:.*]], label %[[CNCL4:.*]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_51]] -// CHECK: 29: ; preds = %[[VAL_45]], %[[VAL_49]] +// CHECK: 31: // 
CHECK: br label %[[VAL_52:.*]] // CHECK: omp.region.cont1: ; preds = %[[VAL_51]] // CHECK: br label %[[VAL_32]] // CHECK: omp_loop.inc: ; preds = %[[VAL_52]] // CHECK: %[[VAL_34]] = add nuw i32 %[[VAL_33]], 1 // CHECK: br label %[[VAL_31]] -// CHECK: omp_loop.exit: ; preds = %[[VAL_50]], %[[VAL_35]] +// CHECK: omp_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_26]]) // CHECK: %[[VAL_53:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_53]]) @@ -242,8 +263,12 @@ llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { // CHECK: br label %[[VAL_55:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_54]] // CHECK: ret void -// CHECK: .cncl: ; preds = %[[VAL_44]] -// CHECK: br label %[[VAL_38]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI:.*]] omp.private {type = firstprivate} @i32_priv : i32 copy { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): diff --git a/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir b/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir index 5e0d3f9f7e293..93fa2064ab99a 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir @@ -24,16 +24,18 @@ llvm.func @cancellation_point_parallel() { // CHECK: %[[VAL_15:.*]] = icmp eq i32 %[[VAL_14]], 0 // CHECK: br i1 %[[VAL_15]], label %[[VAL_16:.*]], label %[[VAL_17:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_12]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: // CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_19:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_18]]) -// CHECK: br label %[[VAL_20:.*]] +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_12]] // CHECK: br label %[[VAL_21:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_16]] // CHECK: br label %[[VAL_22:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_21]] -// CHECK: br label %[[VAL_20]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_22]], %[[VAL_17]] +// CHECK: br label %[[FINI]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancellation_point_sections() { @@ -94,14 +96,12 @@ llvm.func @cancellation_point_sections() { // CHECK: omp_section_loop.inc: ; preds = %[[VAL_46]] // CHECK: %[[VAL_38]] = add nuw i32 %[[VAL_37]], 1 // CHECK: br label %[[VAL_35]] -// CHECK: omp_section_loop.exit: ; preds = %[[VAL_53]], %[[VAL_39]] +// CHECK: omp_section_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_30]]) // CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) // CHECK: br label %[[VAL_56:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_42]] -// CHECK: br label %[[VAL_57:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_56]] // CHECK: ret void // CHECK: omp.section.region.cncl: ; preds = %[[VAL_48]] // CHECK: br label %[[VAL_42]] @@ -175,7 +175,7 @@ llvm.func @cancellation_point_wsloop(%lb : i32, %ub : i32, %step : i32) { // CHECK: omp_loop.inc: ; preds = %[[VAL_106]] // CHECK: %[[VAL_92]] = add nuw i32 %[[VAL_91]], 1 // CHECK: br label %[[VAL_89]] -// CHECK: omp_loop.exit: ; preds = %[[VAL_105]], %[[VAL_93]] +// CHECK: omp_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_84]]) // CHECK: %[[VAL_107:.*]] = call 
i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_107]]) diff --git a/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir b/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir index faccfc678adfe..99f37c7e79be8 100644 --- a/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir +++ b/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir @@ -21,9 +21,11 @@ llvm.func @parallel_infinite_loop() -> () { // CHECK: omp.region.cont: ; No predecessors! // CHECK: br label %[[VAL_4:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_5:.*]] -// CHECK: br label %[[VAL_6:.*]] -// CHECK: omp.par.exit: ; preds = %[[VAL_4]] +// CHECK: br label %[[FINI:.*]] +// CHECK: [[OMP_PAR_EXIT:omp.par.exit]]: ; preds = %[[FINI]] // CHECK: ret void +// CHECK: [[FINI]]: +// CHECK: br label %[[OMP_PAR_EXIT]] // CHECK: } // CHECK-LABEL: define internal void @parallel_infinite_loop..omp_par( diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir index 887d2977e45cc..c79c369b69d7f 100644 --- a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir @@ -108,6 +108,8 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! // CHECK: reduce.finalize: ; preds = %[[VAL_49]], %[[VAL_43]] // CHECK: br label %[[VAL_53:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_48]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: // CHECK: %[[VAL_54:.*]] = load ptr, ptr %[[VAL_20]], align 8 // CHECK: %[[VAL_55:.*]] = load ptr, ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_56:.*]] @@ -115,5 +117,5 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! 
// CHECK: br label %[[VAL_38]] // CHECK: omp.reduction.neutral1: ; preds = %[[VAL_25]] // CHECK: br label %[[VAL_30]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_53]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir index b302b4b20edd5..13f52f054869e 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -127,8 +127,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) // CHECK: br label %[[VAL_37:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_35]] -// CHECK: br label %[[VAL_38:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] // CHECK: %[[VAL_39:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_14]], i64 0, i64 0 // CHECK: store ptr %[[VAL_21]], ptr %[[VAL_39]], align 8 // CHECK: %[[VAL_40:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) @@ -137,9 +135,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: i32 1, label %[[VAL_43:.*]] // CHECK: i32 2, label %[[VAL_44:.*]] // CHECK: ] -// CHECK: reduce.switch.atomic: ; preds = %[[VAL_38]] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_37]] // CHECK: unreachable -// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_38]] +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_37]] // CHECK: %[[VAL_45:.*]] = load ptr, ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_46:.*]] // CHECK: omp.reduction.nonatomic.body: ; preds = %[[VAL_43]] @@ -157,7 +155,7 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp.reduction.nonatomic.body17: ; preds = %[[VAL_47]] // CHECK: %[[VAL_50]] = sub i64 %[[VAL_49]], 1 // CHECK: br label %[[VAL_47]] -// CHECK: reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_38]] +// CHECK: reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_37]] // CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) // CHECK: %[[VAL_56:.*]] = load ptr, ptr %[[VAL_21]], align 8 @@ -173,7 +171,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp.region.cont: ; preds = %[[VAL_62]] // CHECK: br label %[[VAL_64:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_63]] -// CHECK: br label %[[VAL_65:.*]] +// CHECK: br label %[[FINI:.fini.*]] +// CHECK: [[FINI]]: +// CHECK: br label %[[EXIT:.*]] // CHECK: omp.reduction.cleanup21: ; preds = %[[VAL_57]] // CHECK: br label %[[VAL_61]] // CHECK: omp_section_loop.body: ; preds = %[[VAL_32]] @@ -219,5 +219,5 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp_section_loop.inc: ; preds = %[[VAL_69]] // CHECK: %[[VAL_31]] = add nuw i32 %[[VAL_30]], 1 // CHECK: br label %[[VAL_28]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_64]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir index a714ca68a1e95..cb30d3b2f4473 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir @@ -96,8 +96,10 @@ module { // CHECK: reduce.finalize: ; preds = %[[VAL_34]], %[[VAL_28]] // 
CHECK: br label %[[VAL_38:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_33]] +// CHECK: br label %[[FINI:.*]] +// CHECK: [[FINI]]: // CHECK: br label %[[VAL_39:.*]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_38]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void // CHECK: %[[VAL_40:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_41:.*]], i64 0, i64 0 // CHECK: %[[VAL_42:.*]] = load ptr, ptr %[[VAL_40]], align 8 diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir index 19da6f8517fcd..00f6c1b02206e 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir @@ -86,8 +86,6 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_40]]) // CHECK: br label %[[VAL_41:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_39]] -// CHECK: br label %[[VAL_42:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_41]] // CHECK: %[[VAL_43:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_21]], i64 0, i64 0 // CHECK: store ptr %[[VAL_20]], ptr %[[VAL_43]], align 8 // CHECK: %[[VAL_44:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) @@ -96,23 +94,25 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: i32 1, label %[[VAL_47:.*]] // CHECK: i32 2, label %[[VAL_48:.*]] // CHECK: ] -// CHECK: reduce.switch.atomic: ; preds = %[[VAL_42]] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_41]] // CHECK: unreachable -// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_42]] +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_41]] // CHECK: %[[VAL_49:.*]] = load float, ptr %[[VAL_11]], align 4 // CHECK: %[[VAL_50:.*]] = load float, ptr %[[VAL_20]], align 4 // CHECK: %[[VAL_51:.*]] = fadd contract float %[[VAL_49]], %[[VAL_50]] // CHECK: store float %[[VAL_51]], ptr %[[VAL_11]], align 4 // CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_44]], ptr @.gomp_critical_user_.reduction.var) // CHECK: br label %[[VAL_46]] -// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_42]] +// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_41]] // CHECK: %[[VAL_52:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_52]]) // CHECK: br label %[[VAL_53:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_46]] // CHECK: br label %[[VAL_54:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_53]] -// CHECK: br label %[[VAL_55:.*]] +// CHECK: br label %[[FINI:.fini.*]] +// CHECK: [[FINI]]: +// CHECK: br label %[[EXIT:.*]] // CHECK: omp_section_loop.body: ; preds = %[[VAL_36]] // CHECK: %[[VAL_56:.*]] = add i32 %[[VAL_34]], %[[VAL_28]] // CHECK: %[[VAL_57:.*]] = mul i32 %[[VAL_56]], 1 @@ -144,8 +144,10 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: omp_section_loop.inc: ; preds = %[[VAL_59]] // CHECK: %[[VAL_35]] = add nuw i32 %[[VAL_34]], 1 // CHECK: br label %[[VAL_32]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_54]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void + +// CHECK-LABEL: define internal void @.omp.reduction.func // CHECK: %[[VAL_70:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_71:.*]], i64 0, i64 0 // CHECK: %[[VAL_72:.*]] = load ptr, ptr %[[VAL_70]], align 8 // CHECK: %[[VAL_73:.*]] = load float, ptr %[[VAL_72]], align 4 diff --git 
a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 13a7705091b24..c574ba5877b3d 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4444,6 +4444,7 @@ cc_library( ":SCFIncGen", ":SideEffectInterfaces", ":TensorDialect", + ":TransformUtils", ":ValueBoundsOpInterface", ":ViewLikeInterface", "//llvm:Support",