From c9be7a297957de823ae4cd4c681e8ce2cef44adb Mon Sep 17 00:00:00 2001 From: Commit Queue Date: Fri, 27 Oct 2017 09:02:06 +0000 Subject: [PATCH] Merge r223965 - Unreviewed, rolling out r222945. https://bugs.webkit.org/show_bug.cgi?id=178818 "It made WasmBench crash" (Requested by saamyjoon on #webkit). Reverted changeset: "bmalloc mutex should be adaptive" https://bugs.webkit.org/show_bug.cgi?id=177839 https://trac.webkit.org/changeset/222945 --- Source/WTF/ChangeLog | 13 ++ Source/WTF/wtf/LockAlgorithmInlines.h | 4 - Source/WTF/wtf/WordLock.cpp | 15 +- Source/bmalloc/ChangeLog | 13 ++ Source/bmalloc/bmalloc/Algorithm.h | 21 +-- Source/bmalloc/bmalloc/PerThread.h | 8 +- Source/bmalloc/bmalloc/StaticMutex.cpp | 242 ++----------------------- Source/bmalloc/bmalloc/StaticMutex.h | 70 ++++--- 8 files changed, 83 insertions(+), 303 deletions(-) diff --git a/Source/WTF/ChangeLog b/Source/WTF/ChangeLog index fc704640a0ba..f6e9e19f3e9c 100644 --- a/Source/WTF/ChangeLog +++ b/Source/WTF/ChangeLog @@ -1,3 +1,16 @@ +2017-10-25 Commit Queue + + Unreviewed, rolling out r222945. + https://bugs.webkit.org/show_bug.cgi?id=178818 + + "It made WasmBench crash" (Requested by saamyjoon on #webkit). + + Reverted changeset: + + "bmalloc mutex should be adaptive" + https://bugs.webkit.org/show_bug.cgi?id=177839 + https://trac.webkit.org/changeset/222945 + 2017-10-04 Filip Pizlo bmalloc mutex should be adaptive diff --git a/Source/WTF/wtf/LockAlgorithmInlines.h b/Source/WTF/wtf/LockAlgorithmInlines.h index 31b42a5e8aee..0cbe5fa5fac8 100644 --- a/Source/WTF/wtf/LockAlgorithmInlines.h +++ b/Source/WTF/wtf/LockAlgorithmInlines.h @@ -33,10 +33,6 @@ // the lock algorithm slow path without recompiling the world. Right now this should be included in two // places (Lock.cpp and JSCell.cpp). -// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst -// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no -// reward. - namespace WTF { template diff --git a/Source/WTF/wtf/WordLock.cpp b/Source/WTF/wtf/WordLock.cpp index 7361dc0fce92..9e8fb9bff05f 100644 --- a/Source/WTF/wtf/WordLock.cpp +++ b/Source/WTF/wtf/WordLock.cpp @@ -76,10 +76,6 @@ ThreadData* myThreadData() } // anonymous namespace -// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst -// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no -// reward. - NEVER_INLINE void WordLockBase::lockSlow() { unsigned spinCount = 0; @@ -227,8 +223,7 @@ NEVER_INLINE void WordLockBase::unlockSlow() ASSERT(currentWordValue & isLockedBit); ASSERT(currentWordValue & isQueueLockedBit); ThreadData* queueHead = bitwise_cast(currentWordValue & ~queueHeadMask); - RELEASE_ASSERT(queueHead); - RELEASE_ASSERT(queueHead->shouldPark); // This would have been set before the thread was enqueued, so it must still be set now. + ASSERT(queueHead); ThreadData* newQueueHead = queueHead->nextInQueue; // Either this was the only thread on the queue, in which case we delete the queue, or there @@ -261,12 +256,10 @@ NEVER_INLINE void WordLockBase::unlockSlow() { std::lock_guard locker(queueHead->parkingLock); queueHead->shouldPark = false; - // We have to do the notify while still holding the lock, since otherwise, we could try to - // do it after the queueHead has destructed. 
It's impossible for queueHead to destruct - // while we hold the lock, since it is either currently in the wait loop or it's before it - // so it has to grab the lock before destructing. - queueHead->parkingCondition.notify_one(); } + // Doesn't matter if we notify_all() or notify_one() here since the only thread that could be + // waiting is queueHead. + queueHead->parkingCondition.notify_one(); // The old queue head can now contend for the lock again. We're done! } diff --git a/Source/bmalloc/ChangeLog b/Source/bmalloc/ChangeLog index 12b49c2fcd7d..9ba95aa9e8fa 100644 --- a/Source/bmalloc/ChangeLog +++ b/Source/bmalloc/ChangeLog @@ -1,3 +1,16 @@ +2017-10-25 Commit Queue + + Unreviewed, rolling out r222945. + https://bugs.webkit.org/show_bug.cgi?id=178818 + + "It made WasmBench crash" (Requested by saamyjoon on #webkit). + + Reverted changeset: + + "bmalloc mutex should be adaptive" + https://bugs.webkit.org/show_bug.cgi?id=177839 + https://trac.webkit.org/changeset/222945 + 2017-10-23 Zan Dobersek bmalloc::api::tryLargeMemalignVirtual() shouldn't assert on a failed allocation diff --git a/Source/bmalloc/bmalloc/Algorithm.h b/Source/bmalloc/bmalloc/Algorithm.h index e682b78dcb9f..a8be204bf4da 100644 --- a/Source/bmalloc/bmalloc/Algorithm.h +++ b/Source/bmalloc/bmalloc/Algorithm.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2017 Apple Inc. All rights reserved. + * Copyright (C) 2014 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -28,7 +28,6 @@ #include "BAssert.h" #include -#include #include #include #include @@ -159,24 +158,6 @@ inline constexpr unsigned long log2(unsigned long value) return bitCount() - 1 - __builtin_clzl(value); } -// We need a CAS API that isn't the badly designed one from C++. -template -bool compareExchangeWeak(std::atomic& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst) -{ - // They could have made a sensible CAS API, but they didn't. Sneaky fact: for no good reason, the - // C++ API will mutate expected. I think that apologists will say something about how it saves you - // reloading the value around the back of your CAS loop, but that's nonsense. The delay along the - // back edge of a CAS loop has almost no impact on CAS loop performance. More often than not, we - // want to *add* delays to the back edge, not remove them. - return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed); -} - -template -bool compareExchangeStrong(std::atomic& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst) -{ - return word.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed); -} - } // namespace bmalloc #endif // Algorithm_h diff --git a/Source/bmalloc/bmalloc/PerThread.h b/Source/bmalloc/bmalloc/PerThread.h index df87f0eb2a2b..b848f828e954 100644 --- a/Source/bmalloc/bmalloc/PerThread.h +++ b/Source/bmalloc/bmalloc/PerThread.h @@ -60,11 +60,11 @@ class PerThread { static void destructor(void*); }; +#if HAVE_PTHREAD_MACHDEP_H + class Cache; template struct PerThreadStorage; -#if HAVE_PTHREAD_MACHDEP_H - // For now, we only support PerThread>. We can expand to other types by // using more keys. 
template<> struct PerThreadStorage> { @@ -82,7 +82,7 @@ template<> struct PerThreadStorage> { } }; -#endif +#else template struct PerThreadStorage { static bool s_didInitialize; @@ -112,6 +112,8 @@ template bool PerThreadStorage::s_didInitialize; template pthread_key_t PerThreadStorage::s_key; template std::once_flag PerThreadStorage::s_onceFlag; +#endif + template BINLINE T* PerThread::getFastCase() { diff --git a/Source/bmalloc/bmalloc/StaticMutex.cpp b/Source/bmalloc/bmalloc/StaticMutex.cpp index b444af081fd4..1e2e1c3c0c08 100644 --- a/Source/bmalloc/bmalloc/StaticMutex.cpp +++ b/Source/bmalloc/bmalloc/StaticMutex.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2017 Apple Inc. All rights reserved. + * Copyright (C) 2014 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,245 +23,31 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "StaticMutex.h" - -#include "PerThread.h" #include "ScopeExit.h" -#include -#include +#include "StaticMutex.h" #include namespace bmalloc { -// FIXME: This duplicates code from WTF::WordLock. -// https://bugs.webkit.org/show_bug.cgi?id=177719 - -namespace { - -// This data structure serves three purposes: -// -// 1) A parking mechanism for threads that go to sleep. That involves just a system mutex and -// condition variable. -// -// 2) A queue node for when a thread is on some lock's queue. -// -// 3) The queue head. This is kind of funky. When a thread is the head of a queue, it also serves as -// the basic queue bookkeeping data structure. When a thread is dequeued, the next thread in the -// queue takes on the queue head duties. -struct ThreadData { - // The parking mechanism. - bool shouldPark { false }; - std::mutex parkingLock; - std::condition_variable parkingCondition; - - // The queue node. - ThreadData* nextInQueue { nullptr }; - - // The queue itself. - ThreadData* queueTail { nullptr }; -}; - -ThreadData* myThreadData() +void StaticMutex::lockSlowCase() { - return PerThread::get(); -} - -} // anonymous namespace - -// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst -// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no -// reward. - -BNO_INLINE void StaticMutex::lockSlow() -{ - unsigned spinCount = 0; - - // This magic number turns out to be optimal based on past JikesRVM experiments. - const unsigned spinLimit = 40; + // The longest critical section in bmalloc is much shorter than the + // time it takes to make a system call to yield to the OS scheduler. + // So, we try again a lot before we yield. + static const size_t aLot = 256; - for (;;) { - uintptr_t currentWordValue = m_word.load(); - - if (!(currentWordValue & isLockedBit)) { - // It's not possible for someone to hold the queue lock while the lock itself is no longer - // held, since we will only attempt to acquire the queue lock when the lock is held and - // the queue lock prevents unlock. - BASSERT(!(currentWordValue & isQueueLockedBit)); - if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isLockedBit)) { - // Success! We acquired the lock. - return; - } - } - - // If there is no queue and we haven't spun too much, we can just try to spin around again. - if (!(currentWordValue & ~queueHeadMask) && spinCount < spinLimit) { - spinCount++; - sched_yield(); - continue; - } - - // Need to put ourselves on the queue. 
Create the queue if one does not exist. This requires
-        // owning the queue for a little bit. The lock that controls the queue is itself a spinlock.
-        // But before we acquire the queue spinlock, we make sure that we have a ThreadData for this
-        // thread.
-        ThreadData* me = myThreadData();
-        BASSERT(!me->shouldPark);
-        BASSERT(!me->nextInQueue);
-        BASSERT(!me->queueTail);
-
-        // Reload the current word value, since some time may have passed.
-        currentWordValue = m_word.load();
-
-        // We proceed only if the queue lock is not held, the lock is held, and we succeed in
-        // acquiring the queue lock.
-        if ((currentWordValue & isQueueLockedBit)
-            || !(currentWordValue & isLockedBit)
-            || !compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit)) {
-            sched_yield();
-            continue;
-        }
-
-        me->shouldPark = true;
-
-        // We own the queue. Nobody can enqueue or dequeue until we're done. Also, it's not possible
-        // to release the lock while we hold the queue lock.
-        ThreadData* queueHead = reinterpret_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
-        if (queueHead) {
-            // Put this thread at the end of the queue.
-            queueHead->queueTail->nextInQueue = me;
-            queueHead->queueTail = me;
-
-            // Release the queue lock.
-            currentWordValue = m_word.load();
-            BASSERT(currentWordValue & ~queueHeadMask);
-            BASSERT(currentWordValue & isQueueLockedBit);
-            BASSERT(currentWordValue & isLockedBit);
-            m_word.store(currentWordValue & ~isQueueLockedBit);
-        } else {
-            // Make this thread be the queue-head.
-            queueHead = me;
-            me->queueTail = me;
-
-            // Release the queue lock and install ourselves as the head. No need for a CAS loop, since
-            // we own the queue lock.
-            currentWordValue = m_word.load();
-            BASSERT(!(currentWordValue & ~queueHeadMask));
-            BASSERT(currentWordValue & isQueueLockedBit);
-            BASSERT(currentWordValue & isLockedBit);
-            uintptr_t newWordValue = currentWordValue;
-            newWordValue |= reinterpret_cast<uintptr_t>(queueHead);
-            newWordValue &= ~isQueueLockedBit;
-            m_word.store(newWordValue);
-        }
-
-        // At this point everyone who acquires the queue lock will see me on the queue, and anyone who
-        // acquires me's lock will see that me wants to park. Note that shouldPark may have been
-        // cleared as soon as the queue lock was released above, but it will happen while the
-        // releasing thread holds me's parkingLock.
-
-        {
-            std::unique_lock<std::mutex> locker(me->parkingLock);
-            while (me->shouldPark)
-                me->parkingCondition.wait(locker);
-        }
-
-        BASSERT(!me->shouldPark);
-        BASSERT(!me->nextInQueue);
-        BASSERT(!me->queueTail);
-
-        // Now we can loop around and try to acquire the lock again.
-    }
-}
-
-BNO_INLINE void StaticMutex::unlockSlow()
-{
-    // The fast path can fail either because of spurious weak CAS failure, or because someone put a
-    // thread on the queue, or the queue lock is held. If the queue lock is held, it can only be
-    // because someone *will* enqueue a thread onto the queue.
-
-    // Acquire the queue lock, or release the lock. This loop handles both lock release in case the
-    // fast path's weak CAS spuriously failed and it handles queue lock acquisition if there is
-    // actually something interesting on the queue.
-    for (;;) {
-        uintptr_t currentWordValue = m_word.load();
-
-        BASSERT(currentWordValue & isLockedBit);
-
-        if (currentWordValue == isLockedBit) {
-            if (compareExchangeWeak(m_word, isLockedBit, clear)) {
-                // The fast path's weak CAS had spuriously failed, and now we succeeded.
The lock is - // unlocked and we're done! + for (size_t i = 0; i < aLot; ++i) { + if (try_lock()) return; - } - // Loop around and try again. - sched_yield(); - continue; - } - - if (currentWordValue & isQueueLockedBit) { - sched_yield(); - continue; } - - // If it wasn't just a spurious weak CAS failure and if the queue lock is not held, then there - // must be an entry on the queue. - BASSERT(currentWordValue & ~queueHeadMask); - - if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit)) - break; } - uintptr_t currentWordValue = m_word.load(); - - // After we acquire the queue lock, the lock must still be held and the queue must be - // non-empty. The queue must be non-empty since only the lockSlow() method could have held the - // queue lock and if it did then it only releases it after putting something on the queue. - BASSERT(currentWordValue & isLockedBit); - BASSERT(currentWordValue & isQueueLockedBit); - ThreadData* queueHead = reinterpret_cast(currentWordValue & ~queueHeadMask); - RELEASE_BASSERT(queueHead); - RELEASE_BASSERT(queueHead->shouldPark); - - ThreadData* newQueueHead = queueHead->nextInQueue; - // Either this was the only thread on the queue, in which case we delete the queue, or there - // are still more threads on the queue, in which case we create a new queue head. - if (newQueueHead) - newQueueHead->queueTail = queueHead->queueTail; - - // Change the queue head, possibly removing it if newQueueHead is null. No need for a CAS loop, - // since we hold the queue lock and the lock itself so nothing about the lock can change right - // now. - currentWordValue = m_word.load(); - BASSERT(currentWordValue & isLockedBit); - BASSERT(currentWordValue & isQueueLockedBit); - BASSERT((currentWordValue & ~queueHeadMask) == reinterpret_cast(queueHead)); - uintptr_t newWordValue = currentWordValue; - newWordValue &= ~isLockedBit; // Release the lock. - newWordValue &= ~isQueueLockedBit; // Release the queue lock. - newWordValue &= queueHeadMask; // Clear out the old queue head. - newWordValue |= reinterpret_cast(newQueueHead); // Install new queue head. - m_word.store(newWordValue); - - // Now the lock is available for acquisition. But we just have to wake up the old queue head. - // After that, we're done! - - queueHead->nextInQueue = nullptr; - queueHead->queueTail = nullptr; - - // We do this carefully because this may run either before or during the parkingLock critical - // section in lockSlow(). - { - std::lock_guard locker(queueHead->parkingLock); - RELEASE_BASSERT(queueHead->shouldPark); - queueHead->shouldPark = false; - // We have to do the notify while still holding the lock, since otherwise, we could try to - // do it after the queueHead has destructed. It's impossible for queueHead to destruct - // while we hold the lock, since it is either currently in the wait loop or it's before it - // so it has to grab the lock before destructing. - queueHead->parkingCondition.notify_one(); - } - - // The old queue head can now contend for the lock again. We're done! + // Avoid spinning pathologically. + while (!try_lock()) + sched_yield(); } } // namespace bmalloc diff --git a/Source/bmalloc/bmalloc/StaticMutex.h b/Source/bmalloc/bmalloc/StaticMutex.h index 2366733fca1f..9e1361749d95 100644 --- a/Source/bmalloc/bmalloc/StaticMutex.h +++ b/Source/bmalloc/bmalloc/StaticMutex.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2017 Apple Inc. All rights reserved. + * Copyright (C) 2014 Apple Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,9 +26,7 @@ #ifndef StaticMutex_h #define StaticMutex_h -#include "Algorithm.h" #include "BAssert.h" -#include "BExport.h" #include #include #include @@ -38,16 +36,6 @@ // Use StaticMutex in static storage, where global constructors and exit-time // destructors are prohibited, but all memory is zero-initialized automatically. -// This uses the code from what WTF used to call WordLock. It's a fully adaptive mutex that uses -// sizeof(void*) storage. It has a fast path that is similar to a spinlock, and a slow path that is -// similar to std::mutex. - -// NOTE: In general, this is a great lock to use if you are very low in the stack. WTF continues to -// have a copy of this code. - -// FIXME: Either fold bmalloc into WTF or find a better way to share code between them. -// https://bugs.webkit.org/show_bug.cgi?id=177719 - namespace bmalloc { class StaticMutex { @@ -57,49 +45,57 @@ class StaticMutex { public: void lock(); - bool tryLock(); - bool try_lock() { return tryLock(); } + bool try_lock(); void unlock(); - - bool isHeld() const; - bool isLocked() const { return isHeld(); } private: - BEXPORT void lockSlow(); - BEXPORT void unlockSlow(); - - static const uintptr_t clear = 0; - static const uintptr_t isLockedBit = 1; - static const uintptr_t isQueueLockedBit = 2; - static const uintptr_t queueHeadMask = 3; + void lockSlowCase(); - std::atomic m_word; + std::atomic_flag m_flag; + std::atomic_flag m_isSpinning; }; +static inline void sleep( + std::unique_lock& lock, std::chrono::milliseconds duration) +{ + if (duration == std::chrono::milliseconds(0)) + return; + + lock.unlock(); + std::this_thread::sleep_for(duration); + lock.lock(); +} + +static inline void waitUntilFalse( + std::unique_lock& lock, std::chrono::milliseconds sleepDuration, + bool& condition) +{ + while (condition) { + condition = false; + sleep(lock, sleepDuration); + } +} + inline void StaticMutex::init() { - m_word.store(0, std::memory_order_relaxed); + m_flag.clear(); + m_isSpinning.clear(); } -inline bool StaticMutex::tryLock() +inline bool StaticMutex::try_lock() { - return m_word.load(std::memory_order_acquire) & isLockedBit; + return !m_flag.test_and_set(std::memory_order_acquire); } inline void StaticMutex::lock() { - if (compareExchangeWeak(m_word, clear, isLockedBit, std::memory_order_acquire)) - return; - - lockSlow(); + if (!try_lock()) + lockSlowCase(); } inline void StaticMutex::unlock() { - if (compareExchangeWeak(m_word, isLockedBit, clear, std::memory_order_release)) - return; - - unlockSlow(); + m_flag.clear(std::memory_order_release); } } // namespace bmalloc
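
The net effect of this rollout is to replace the adaptive, queue-based mutex with bmalloc's original spin-then-yield StaticMutex, on the stated theory that bmalloc's longest critical section is shorter than the system call it takes to reach the OS scheduler. The restored slow path is small enough to sketch as a standalone class. SpinYieldMutex below is an illustrative rendering, not bmalloc API, and it omits the m_isSpinning throttle that the real lockSlowCase() uses so that only one waiter runs the long retry loop:

    #include <atomic>
    #include <sched.h>

    class SpinYieldMutex {
    public:
        bool try_lock()
        {
            // test_and_set() returns the previous value, so false means the
            // flag was clear and this thread now owns the lock.
            return !m_flag.test_and_set(std::memory_order_acquire);
        }

        void lock()
        {
            if (!try_lock())
                lockSlowCase();
        }

        void unlock() { m_flag.clear(std::memory_order_release); }

    private:
        void lockSlowCase()
        {
            // Retry a lot before involving the scheduler: the holder is
            // probably in a critical section shorter than a system call.
            static const size_t aLot = 256;
            for (size_t i = 0; i < aLot; ++i) {
                if (try_lock())
                    return;
            }

            // Avoid spinning pathologically under real contention.
            while (!try_lock())
                sched_yield();
        }

        std::atomic_flag m_flag = ATOMIC_FLAG_INIT;
    };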
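
The restored sleep() and waitUntilFalse() helpers, whose lock parameter is a std::unique_lock<StaticMutex>&, implement a back-off protocol: the waiter clears a flag, sleeps with the lock dropped, and proceeds only once a full sleep interval passes without anyone re-setting it. A usage sketch under assumed names (HeapLike, m_isAllocating, and the scavenger framing are illustrative, not quoted from this patch), written against std::mutex so it is self-contained:

    #include <chrono>
    #include <mutex>
    #include <thread>

    // Drop the lock while sleeping so other threads are not blocked.
    static void sleepUnlocked(std::unique_lock<std::mutex>& lock, std::chrono::milliseconds duration)
    {
        if (duration == std::chrono::milliseconds(0))
            return;
        lock.unlock();
        std::this_thread::sleep_for(duration);
        lock.lock();
    }

    // Keep sleeping until a full interval passes with `condition` never set;
    // writers set `condition = true` under the lock to push the waiter back.
    static void waitUntilFalse(std::unique_lock<std::mutex>& lock,
        std::chrono::milliseconds sleepDuration, bool& condition)
    {
        while (condition) {
            condition = false;
            sleepUnlocked(lock, sleepDuration);
        }
    }

    struct HeapLike {
        std::mutex mutex;
        bool m_isAllocating { false }; // set under the lock by allocation paths

        // Scavenger-style consumer: reclaims only after a quiet interval.
        void scavenge(std::chrono::milliseconds sleepDuration)
        {
            std::unique_lock<std::mutex> lock(mutex);
            waitUntilFalse(lock, sleepDuration, m_isAllocating);
            // ... return free memory to the OS here ...
        }
    };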
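
On the reverted side, the Algorithm.h hunk deletes the compareExchangeWeak() and compareExchangeStrong() helpers. Their point, per the deleted comment, is that std::atomic's compare_exchange family takes `expected` by reference and overwrites it with the observed value on failure, while these wrappers take it by value so call sites keep their locals intact. Reconstructed as a standalone snippet:

    #include <atomic>

    // std::atomic<T>::compare_exchange_{weak,strong} write the observed value
    // back into `expected` on failure; passing by value hides that side effect.
    template<typename T>
    bool compareExchangeWeak(std::atomic<T>& word, T expected, T desired,
        std::memory_order order = std::memory_order_seq_cst)
    {
        return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed);
    }

    template<typename T>
    bool compareExchangeStrong(std::atomic<T>& word, T expected, T desired,
        std::memory_order order = std::memory_order_seq_cst)
    {
        return word.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed);
    }

A CAS loop call site can then pass a fresh snapshot each iteration, as in the reverted lockSlow()'s compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit), without the failure path scribbling on currentWordValue.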
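
The reverted StaticMutex packed the whole adaptive lock into a single word: bit 0 (isLockedBit) says the lock is held, bit 1 (isQueueLockedBit) says the wait queue is locked, and the word with queueHeadMask cleared is a ThreadData* for the queue head, which works because ThreadData is at least 4-byte aligned. Both fast paths then reduce to single weak CASes, as in this sketch built on the compareExchangeWeak() helper above (AdaptiveLockWord is an illustrative name):

    #include <atomic>
    #include <cstdint>

    class AdaptiveLockWord {
    public:
        bool lockFast()
        {
            // Uncontended acquire: 0 -> isLockedBit. Any set bit (held lock,
            // locked queue, or a queued waiter) forces the slow path.
            return compareExchangeWeak(m_word, clear, isLockedBit, std::memory_order_acquire);
        }

        bool unlockFast()
        {
            // Uncontended release: isLockedBit -> 0. A nonempty queue or a
            // held queue lock makes the CAS fail, routing to the slow path.
            return compareExchangeWeak(m_word, isLockedBit, clear, std::memory_order_release);
        }

    private:
        static constexpr uintptr_t clear = 0;
        static constexpr uintptr_t isLockedBit = 1;
        static constexpr uintptr_t isQueueLockedBit = 2;
        static constexpr uintptr_t queueHeadMask = 3; // low bits, excluded from the ThreadData*

        std::atomic<uintptr_t> m_word { 0 };
    };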
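
When the fast paths fail, the reverted design parks the waiter on a per-thread mutex and condition variable pair (ThreadData). The WordLock.cpp hunk shows the ordering question this rollout toggles: r222945 notified while still holding parkingLock, arguing that otherwise the waiter could wake, return, and destroy its ThreadData before the notify touches it, whereas the restored code notifies after the critical section. A reduced sketch of just that handshake, assuming the queue bookkeeping has already happened:

    #include <condition_variable>
    #include <mutex>

    // Stand-in for the per-thread parking state used by WordLock/StaticMutex.
    struct ThreadData {
        bool shouldPark { false };
        std::mutex parkingLock;
        std::condition_variable parkingCondition;
    };

    // Waiter side: called after this thread has put itself on the lock's
    // queue with shouldPark already set to true.
    void park(ThreadData& me)
    {
        std::unique_lock<std::mutex> locker(me.parkingLock);
        while (me.shouldPark)
            me.parkingCondition.wait(locker);
    }

    // Unlocker side: called on the dequeued queue head.
    void unpark(ThreadData& queueHead)
    {
        std::lock_guard<std::mutex> locker(queueHead.parkingLock);
        queueHead.shouldPark = false;
        // Notifying while still holding parkingLock means the waiter cannot
        // return from park() and destroy its ThreadData between our store to
        // shouldPark and our notify; that is the ordering r222945 relied on.
        queueHead.parkingCondition.notify_one();
    }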
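
Finally, the PerThread.h hunk restores the #if HAVE_PTHREAD_MACHDEP_H split so that the reserved-key specialization and the generic pthread branch are mutually exclusive. The generic branch's pattern, a once-created process-wide key with a destructor that runs at thread exit, looks roughly like the following sketch (PerThreadSlot is an illustrative name; the real bmalloc template separates fast and slow get() paths and does not allocate with plain new):

    #include <mutex>
    #include <pthread.h>

    // Generic per-thread slot in the style of PerThreadStorage's pthread
    // branch: one process-wide key, created once, with a thread-exit destructor.
    template<typename T>
    struct PerThreadSlot {
        static pthread_key_t s_key;
        static std::once_flag s_onceFlag;

        static void destruct(void* p) { delete static_cast<T*>(p); }

        static T* get()
        {
            std::call_once(s_onceFlag, [] {
                pthread_key_create(&s_key, &destruct);
            });
            void* value = pthread_getspecific(s_key);
            if (!value) {
                value = new T();
                pthread_setspecific(s_key, value);
            }
            return static_cast<T*>(value);
        }
    };

    template<typename T> pthread_key_t PerThreadSlot<T>::s_key;
    template<typename T> std::once_flag PerThreadSlot<T>::s_onceFlag;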