From c9be7a297957de823ae4cd4c681e8ce2cef44adb Mon Sep 17 00:00:00 2001 From: Commit Queue Date: Fri, 27 Oct 2017 09:02:06 +0000 Subject: [PATCH] Merge r223965 - Unreviewed, rolling out r222945. https://bugs.webkit.org/show_bug.cgi?id=178818 "It made WasmBench crash" (Requested by saamyjoon on #webkit). Reverted changeset: "bmalloc mutex should be adaptive" https://bugs.webkit.org/show_bug.cgi?id=177839 https://trac.webkit.org/changeset/222945 --- Source/WTF/ChangeLog | 13 ++ Source/WTF/wtf/LockAlgorithmInlines.h | 4 - Source/WTF/wtf/WordLock.cpp | 15 +- Source/bmalloc/ChangeLog | 13 ++ Source/bmalloc/bmalloc/Algorithm.h | 21 +-- Source/bmalloc/bmalloc/PerThread.h | 8 +- Source/bmalloc/bmalloc/StaticMutex.cpp | 242 ++----------------------- Source/bmalloc/bmalloc/StaticMutex.h | 70 ++++--- 8 files changed, 83 insertions(+), 303 deletions(-) diff --git a/Source/WTF/ChangeLog b/Source/WTF/ChangeLog index fc704640a0ba..f6e9e19f3e9c 100644 --- a/Source/WTF/ChangeLog +++ b/Source/WTF/ChangeLog @@ -1,3 +1,16 @@ +2017-10-25 Commit Queue + + Unreviewed, rolling out r222945. + https://bugs.webkit.org/show_bug.cgi?id=178818 + + "It made WasmBench crash" (Requested by saamyjoon on #webkit). + + Reverted changeset: + + "bmalloc mutex should be adaptive" + https://bugs.webkit.org/show_bug.cgi?id=177839 + https://trac.webkit.org/changeset/222945 + 2017-10-04 Filip Pizlo bmalloc mutex should be adaptive diff --git a/Source/WTF/wtf/LockAlgorithmInlines.h b/Source/WTF/wtf/LockAlgorithmInlines.h index 31b42a5e8aee..0cbe5fa5fac8 100644 --- a/Source/WTF/wtf/LockAlgorithmInlines.h +++ b/Source/WTF/wtf/LockAlgorithmInlines.h @@ -33,10 +33,6 @@ // the lock algorithm slow path without recompiling the world. Right now this should be included in two // places (Lock.cpp and JSCell.cpp). -// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst -// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no -// reward. - namespace WTF { template diff --git a/Source/WTF/wtf/WordLock.cpp b/Source/WTF/wtf/WordLock.cpp index 7361dc0fce92..9e8fb9bff05f 100644 --- a/Source/WTF/wtf/WordLock.cpp +++ b/Source/WTF/wtf/WordLock.cpp @@ -76,10 +76,6 @@ ThreadData* myThreadData() } // anonymous namespace -// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst -// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no -// reward. - NEVER_INLINE void WordLockBase::lockSlow() { unsigned spinCount = 0; @@ -227,8 +223,7 @@ NEVER_INLINE void WordLockBase::unlockSlow() ASSERT(currentWordValue & isLockedBit); ASSERT(currentWordValue & isQueueLockedBit); ThreadData* queueHead = bitwise_cast(currentWordValue & ~queueHeadMask); - RELEASE_ASSERT(queueHead); - RELEASE_ASSERT(queueHead->shouldPark); // This would have been set before the thread was enqueued, so it must still be set now. + ASSERT(queueHead); ThreadData* newQueueHead = queueHead->nextInQueue; // Either this was the only thread on the queue, in which case we delete the queue, or there @@ -261,12 +256,10 @@ NEVER_INLINE void WordLockBase::unlockSlow() { std::lock_guard locker(queueHead->parkingLock); queueHead->shouldPark = false; - // We have to do the notify while still holding the lock, since otherwise, we could try to - // do it after the queueHead has destructed. 
It's impossible for queueHead to destruct - // while we hold the lock, since it is either currently in the wait loop or it's before it - // so it has to grab the lock before destructing. - queueHead->parkingCondition.notify_one(); } + // Doesn't matter if we notify_all() or notify_one() here since the only thread that could be + // waiting is queueHead. + queueHead->parkingCondition.notify_one(); // The old queue head can now contend for the lock again. We're done! } diff --git a/Source/bmalloc/ChangeLog b/Source/bmalloc/ChangeLog index 12b49c2fcd7d..9ba95aa9e8fa 100644 --- a/Source/bmalloc/ChangeLog +++ b/Source/bmalloc/ChangeLog @@ -1,3 +1,16 @@ +2017-10-25 Commit Queue + + Unreviewed, rolling out r222945. + https://bugs.webkit.org/show_bug.cgi?id=178818 + + "It made WasmBench crash" (Requested by saamyjoon on #webkit). + + Reverted changeset: + + "bmalloc mutex should be adaptive" + https://bugs.webkit.org/show_bug.cgi?id=177839 + https://trac.webkit.org/changeset/222945 + 2017-10-23 Zan Dobersek bmalloc::api::tryLargeMemalignVirtual() shouldn't assert on a failed allocation diff --git a/Source/bmalloc/bmalloc/Algorithm.h b/Source/bmalloc/bmalloc/Algorithm.h index e682b78dcb9f..a8be204bf4da 100644 --- a/Source/bmalloc/bmalloc/Algorithm.h +++ b/Source/bmalloc/bmalloc/Algorithm.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2017 Apple Inc. All rights reserved. + * Copyright (C) 2014 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -28,7 +28,6 @@ #include "BAssert.h" #include -#include #include #include #include @@ -159,24 +158,6 @@ inline constexpr unsigned long log2(unsigned long value) return bitCount() - 1 - __builtin_clzl(value); } -// We need a CAS API that isn't the badly designed one from C++. -template -bool compareExchangeWeak(std::atomic& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst) -{ - // They could have made a sensible CAS API, but they didn't. Sneaky fact: for no good reason, the - // C++ API will mutate expected. I think that apologists will say something about how it saves you - // reloading the value around the back of your CAS loop, but that's nonsense. The delay along the - // back edge of a CAS loop has almost no impact on CAS loop performance. More often than not, we - // want to *add* delays to the back edge, not remove them. - return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed); -} - -template -bool compareExchangeStrong(std::atomic& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst) -{ - return word.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed); -} - } // namespace bmalloc #endif // Algorithm_h diff --git a/Source/bmalloc/bmalloc/PerThread.h b/Source/bmalloc/bmalloc/PerThread.h index df87f0eb2a2b..b848f828e954 100644 --- a/Source/bmalloc/bmalloc/PerThread.h +++ b/Source/bmalloc/bmalloc/PerThread.h @@ -60,11 +60,11 @@ class PerThread { static void destructor(void*); }; +#if HAVE_PTHREAD_MACHDEP_H + class Cache; template struct PerThreadStorage; -#if HAVE_PTHREAD_MACHDEP_H - // For now, we only support PerThread>. We can expand to other types by // using more keys. 
template<> struct PerThreadStorage> { @@ -82,7 +82,7 @@ template<> struct PerThreadStorage> { } }; -#endif +#else template struct PerThreadStorage { static bool s_didInitialize; @@ -112,6 +112,8 @@ template bool PerThreadStorage::s_didInitialize; template pthread_key_t PerThreadStorage::s_key; template std::once_flag PerThreadStorage::s_onceFlag; +#endif + template BINLINE T* PerThread::getFastCase() { diff --git a/Source/bmalloc/bmalloc/StaticMutex.cpp b/Source/bmalloc/bmalloc/StaticMutex.cpp index b444af081fd4..1e2e1c3c0c08 100644 --- a/Source/bmalloc/bmalloc/StaticMutex.cpp +++ b/Source/bmalloc/bmalloc/StaticMutex.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2017 Apple Inc. All rights reserved. + * Copyright (C) 2014 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,245 +23,31 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "StaticMutex.h" - -#include "PerThread.h" #include "ScopeExit.h" -#include -#include +#include "StaticMutex.h" #include namespace bmalloc { -// FIXME: This duplicates code from WTF::WordLock. -// https://bugs.webkit.org/show_bug.cgi?id=177719 - -namespace { - -// This data structure serves three purposes: -// -// 1) A parking mechanism for threads that go to sleep. That involves just a system mutex and -// condition variable. -// -// 2) A queue node for when a thread is on some lock's queue. -// -// 3) The queue head. This is kind of funky. When a thread is the head of a queue, it also serves as -// the basic queue bookkeeping data structure. When a thread is dequeued, the next thread in the -// queue takes on the queue head duties. -struct ThreadData { - // The parking mechanism. - bool shouldPark { false }; - std::mutex parkingLock; - std::condition_variable parkingCondition; - - // The queue node. - ThreadData* nextInQueue { nullptr }; - - // The queue itself. - ThreadData* queueTail { nullptr }; -}; - -ThreadData* myThreadData() +void StaticMutex::lockSlowCase() { - return PerThread::get(); -} - -} // anonymous namespace - -// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst -// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no -// reward. - -BNO_INLINE void StaticMutex::lockSlow() -{ - unsigned spinCount = 0; - - // This magic number turns out to be optimal based on past JikesRVM experiments. - const unsigned spinLimit = 40; + // The longest critical section in bmalloc is much shorter than the + // time it takes to make a system call to yield to the OS scheduler. + // So, we try again a lot before we yield. + static const size_t aLot = 256; - for (;;) { - uintptr_t currentWordValue = m_word.load(); - - if (!(currentWordValue & isLockedBit)) { - // It's not possible for someone to hold the queue lock while the lock itself is no longer - // held, since we will only attempt to acquire the queue lock when the lock is held and - // the queue lock prevents unlock. - BASSERT(!(currentWordValue & isQueueLockedBit)); - if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isLockedBit)) { - // Success! We acquired the lock. - return; - } - } - - // If there is no queue and we haven't spun too much, we can just try to spin around again. - if (!(currentWordValue & ~queueHeadMask) && spinCount < spinLimit) { - spinCount++; - sched_yield(); - continue; - } - - // Need to put ourselves on the queue. 
Create the queue if one does not exist. This requires
-        // owning the queue for a little bit. The lock that controls the queue is itself a spinlock.
-        // But before we acquire the queue spinlock, we make sure that we have a ThreadData for this
-        // thread.
-        ThreadData* me = myThreadData();
-        BASSERT(!me->shouldPark);
-        BASSERT(!me->nextInQueue);
-        BASSERT(!me->queueTail);
-
-        // Reload the current word value, since some time may have passed.
-        currentWordValue = m_word.load();
-
-        // We proceed only if the queue lock is not held, the lock is held, and we succeed in
-        // acquiring the queue lock.
-        if ((currentWordValue & isQueueLockedBit)
-            || !(currentWordValue & isLockedBit)
-            || !compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit)) {
-            sched_yield();
-            continue;
-        }
-
-        me->shouldPark = true;
-
-        // We own the queue. Nobody can enqueue or dequeue until we're done. Also, it's not possible
-        // to release the lock while we hold the queue lock.
-        ThreadData* queueHead = reinterpret_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
-        if (queueHead) {
-            // Put this thread at the end of the queue.
-            queueHead->queueTail->nextInQueue = me;
-            queueHead->queueTail = me;
-
-            // Release the queue lock.
-            currentWordValue = m_word.load();
-            BASSERT(currentWordValue & ~queueHeadMask);
-            BASSERT(currentWordValue & isQueueLockedBit);
-            BASSERT(currentWordValue & isLockedBit);
-            m_word.store(currentWordValue & ~isQueueLockedBit);
-        } else {
-            // Make this thread be the queue-head.
-            queueHead = me;
-            me->queueTail = me;
-
-            // Release the queue lock and install ourselves as the head. No need for a CAS loop, since
-            // we own the queue lock.
-            currentWordValue = m_word.load();
-            BASSERT(!(currentWordValue & ~queueHeadMask));
-            BASSERT(currentWordValue & isQueueLockedBit);
-            BASSERT(currentWordValue & isLockedBit);
-            uintptr_t newWordValue = currentWordValue;
-            newWordValue |= reinterpret_cast<uintptr_t>(queueHead);
-            newWordValue &= ~isQueueLockedBit;
-            m_word.store(newWordValue);
-        }
-
-        // At this point everyone who acquires the queue lock will see me on the queue, and anyone who
-        // acquires me's lock will see that me wants to park. Note that shouldPark may have been
-        // cleared as soon as the queue lock was released above, but it will happen while the
-        // releasing thread holds me's parkingLock.
-
-        {
-            std::unique_lock<std::mutex> locker(me->parkingLock);
-            while (me->shouldPark)
-                me->parkingCondition.wait(locker);
-        }
-
-        BASSERT(!me->shouldPark);
-        BASSERT(!me->nextInQueue);
-        BASSERT(!me->queueTail);
-
-        // Now we can loop around and try to acquire the lock again.
-    }
-}
-
-BNO_INLINE void StaticMutex::unlockSlow()
-{
-    // The fast path can fail either because of spurious weak CAS failure, or because someone put a
-    // thread on the queue, or the queue lock is held. If the queue lock is held, it can only be
-    // because someone *will* enqueue a thread onto the queue.
-
-    // Acquire the queue lock, or release the lock. This loop handles both lock release in case the
-    // fast path's weak CAS spuriously failed and it handles queue lock acquisition if there is
-    // actually something interesting on the queue.
-    for (;;) {
-        uintptr_t currentWordValue = m_word.load();
-
-        BASSERT(currentWordValue & isLockedBit);
-
-        if (currentWordValue == isLockedBit) {
-            if (compareExchangeWeak(m_word, isLockedBit, clear)) {
-                // The fast path's weak CAS had spuriously failed, and now we succeeded.
The lock is - // unlocked and we're done! + for (size_t i = 0; i < aLot; ++i) { + if (try_lock()) return; - } - // Loop around and try again. - sched_yield(); - continue; - } - - if (currentWordValue & isQueueLockedBit) { - sched_yield(); - continue; } - - // If it wasn't just a spurious weak CAS failure and if the queue lock is not held, then there - // must be an entry on the queue. - BASSERT(currentWordValue & ~queueHeadMask); - - if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit)) - break; } - uintptr_t currentWordValue = m_word.load(); - - // After we acquire the queue lock, the lock must still be held and the queue must be - // non-empty. The queue must be non-empty since only the lockSlow() method could have held the - // queue lock and if it did then it only releases it after putting something on the queue. - BASSERT(currentWordValue & isLockedBit); - BASSERT(currentWordValue & isQueueLockedBit); - ThreadData* queueHead = reinterpret_cast(currentWordValue & ~queueHeadMask); - RELEASE_BASSERT(queueHead); - RELEASE_BASSERT(queueHead->shouldPark); - - ThreadData* newQueueHead = queueHead->nextInQueue; - // Either this was the only thread on the queue, in which case we delete the queue, or there - // are still more threads on the queue, in which case we create a new queue head. - if (newQueueHead) - newQueueHead->queueTail = queueHead->queueTail; - - // Change the queue head, possibly removing it if newQueueHead is null. No need for a CAS loop, - // since we hold the queue lock and the lock itself so nothing about the lock can change right - // now. - currentWordValue = m_word.load(); - BASSERT(currentWordValue & isLockedBit); - BASSERT(currentWordValue & isQueueLockedBit); - BASSERT((currentWordValue & ~queueHeadMask) == reinterpret_cast(queueHead)); - uintptr_t newWordValue = currentWordValue; - newWordValue &= ~isLockedBit; // Release the lock. - newWordValue &= ~isQueueLockedBit; // Release the queue lock. - newWordValue &= queueHeadMask; // Clear out the old queue head. - newWordValue |= reinterpret_cast(newQueueHead); // Install new queue head. - m_word.store(newWordValue); - - // Now the lock is available for acquisition. But we just have to wake up the old queue head. - // After that, we're done! - - queueHead->nextInQueue = nullptr; - queueHead->queueTail = nullptr; - - // We do this carefully because this may run either before or during the parkingLock critical - // section in lockSlow(). - { - std::lock_guard locker(queueHead->parkingLock); - RELEASE_BASSERT(queueHead->shouldPark); - queueHead->shouldPark = false; - // We have to do the notify while still holding the lock, since otherwise, we could try to - // do it after the queueHead has destructed. It's impossible for queueHead to destruct - // while we hold the lock, since it is either currently in the wait loop or it's before it - // so it has to grab the lock before destructing. - queueHead->parkingCondition.notify_one(); - } - - // The old queue head can now contend for the lock again. We're done! + // Avoid spinning pathologically. + while (!try_lock()) + sched_yield(); } } // namespace bmalloc diff --git a/Source/bmalloc/bmalloc/StaticMutex.h b/Source/bmalloc/bmalloc/StaticMutex.h index 2366733fca1f..9e1361749d95 100644 --- a/Source/bmalloc/bmalloc/StaticMutex.h +++ b/Source/bmalloc/bmalloc/StaticMutex.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2017 Apple Inc. All rights reserved. + * Copyright (C) 2014 Apple Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,9 +26,7 @@ #ifndef StaticMutex_h #define StaticMutex_h -#include "Algorithm.h" #include "BAssert.h" -#include "BExport.h" #include #include #include @@ -38,16 +36,6 @@ // Use StaticMutex in static storage, where global constructors and exit-time // destructors are prohibited, but all memory is zero-initialized automatically. -// This uses the code from what WTF used to call WordLock. It's a fully adaptive mutex that uses -// sizeof(void*) storage. It has a fast path that is similar to a spinlock, and a slow path that is -// similar to std::mutex. - -// NOTE: In general, this is a great lock to use if you are very low in the stack. WTF continues to -// have a copy of this code. - -// FIXME: Either fold bmalloc into WTF or find a better way to share code between them. -// https://bugs.webkit.org/show_bug.cgi?id=177719 - namespace bmalloc { class StaticMutex { @@ -57,49 +45,57 @@ class StaticMutex { public: void lock(); - bool tryLock(); - bool try_lock() { return tryLock(); } + bool try_lock(); void unlock(); - - bool isHeld() const; - bool isLocked() const { return isHeld(); } private: - BEXPORT void lockSlow(); - BEXPORT void unlockSlow(); - - static const uintptr_t clear = 0; - static const uintptr_t isLockedBit = 1; - static const uintptr_t isQueueLockedBit = 2; - static const uintptr_t queueHeadMask = 3; + void lockSlowCase(); - std::atomic m_word; + std::atomic_flag m_flag; + std::atomic_flag m_isSpinning; }; +static inline void sleep( + std::unique_lock& lock, std::chrono::milliseconds duration) +{ + if (duration == std::chrono::milliseconds(0)) + return; + + lock.unlock(); + std::this_thread::sleep_for(duration); + lock.lock(); +} + +static inline void waitUntilFalse( + std::unique_lock& lock, std::chrono::milliseconds sleepDuration, + bool& condition) +{ + while (condition) { + condition = false; + sleep(lock, sleepDuration); + } +} + inline void StaticMutex::init() { - m_word.store(0, std::memory_order_relaxed); + m_flag.clear(); + m_isSpinning.clear(); } -inline bool StaticMutex::tryLock() +inline bool StaticMutex::try_lock() { - return m_word.load(std::memory_order_acquire) & isLockedBit; + return !m_flag.test_and_set(std::memory_order_acquire); } inline void StaticMutex::lock() { - if (compareExchangeWeak(m_word, clear, isLockedBit, std::memory_order_acquire)) - return; - - lockSlow(); + if (!try_lock()) + lockSlowCase(); } inline void StaticMutex::unlock() { - if (compareExchangeWeak(m_word, isLockedBit, clear, std::memory_order_release)) - return; - - unlockSlow(); + m_flag.clear(std::memory_order_release); } } // namespace bmalloc
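
The net effect of this rollout is to replace the adaptive, queue-based mutex with bmalloc's original spin-then-yield StaticMutex, on the stated theory that bmalloc's longest critical section is shorter than the system call it takes to reach the OS scheduler. The restored slow path is small enough to sketch as a standalone class. SpinYieldMutex below is an illustrative rendering, not bmalloc API, and it omits the m_isSpinning throttle that the real lockSlowCase() uses so that only one waiter runs the long retry loop:

    #include <atomic>
    #include <sched.h>

    class SpinYieldMutex {
    public:
        bool try_lock()
        {
            // test_and_set() returns the previous value, so false means the
            // flag was clear and this thread now owns the lock.
            return !m_flag.test_and_set(std::memory_order_acquire);
        }

        void lock()
        {
            if (!try_lock())
                lockSlowCase();
        }

        void unlock() { m_flag.clear(std::memory_order_release); }

    private:
        void lockSlowCase()
        {
            // Retry a lot before involving the scheduler: the holder is
            // probably in a critical section shorter than a system call.
            static const size_t aLot = 256;
            for (size_t i = 0; i < aLot; ++i) {
                if (try_lock())
                    return;
            }

            // Avoid spinning pathologically under real contention.
            while (!try_lock())
                sched_yield();
        }

        std::atomic_flag m_flag = ATOMIC_FLAG_INIT;
    };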
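
The restored sleep() and waitUntilFalse() helpers, whose lock parameter is a std::unique_lock<StaticMutex>&, implement a back-off protocol: the waiter clears a flag, sleeps with the lock dropped, and proceeds only once a full sleep interval passes without anyone re-setting it. A usage sketch under assumed names (HeapLike, m_isAllocating, and the scavenger framing are illustrative, not quoted from this patch), written against std::mutex so it is self-contained:

    #include <chrono>
    #include <mutex>
    #include <thread>

    // Drop the lock while sleeping so other threads are not blocked.
    static void sleepUnlocked(std::unique_lock<std::mutex>& lock, std::chrono::milliseconds duration)
    {
        if (duration == std::chrono::milliseconds(0))
            return;
        lock.unlock();
        std::this_thread::sleep_for(duration);
        lock.lock();
    }

    // Keep sleeping until a full interval passes with `condition` never set;
    // writers set `condition = true` under the lock to push the waiter back.
    static void waitUntilFalse(std::unique_lock<std::mutex>& lock,
        std::chrono::milliseconds sleepDuration, bool& condition)
    {
        while (condition) {
            condition = false;
            sleepUnlocked(lock, sleepDuration);
        }
    }

    struct HeapLike {
        std::mutex mutex;
        bool m_isAllocating { false }; // set under the lock by allocation paths

        // Scavenger-style consumer: reclaims only after a quiet interval.
        void scavenge(std::chrono::milliseconds sleepDuration)
        {
            std::unique_lock<std::mutex> lock(mutex);
            waitUntilFalse(lock, sleepDuration, m_isAllocating);
            // ... return free memory to the OS here ...
        }
    };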
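
On the reverted side, the Algorithm.h hunk deletes the compareExchangeWeak() and compareExchangeStrong() helpers. Their point, per the deleted comment, is that std::atomic's compare_exchange family takes `expected` by reference and overwrites it with the observed value on failure, while these wrappers take it by value so call sites keep their locals intact. Reconstructed as a standalone snippet:

    #include <atomic>

    // std::atomic<T>::compare_exchange_{weak,strong} write the observed value
    // back into `expected` on failure; passing by value hides that side effect.
    template<typename T>
    bool compareExchangeWeak(std::atomic<T>& word, T expected, T desired,
        std::memory_order order = std::memory_order_seq_cst)
    {
        return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed);
    }

    template<typename T>
    bool compareExchangeStrong(std::atomic<T>& word, T expected, T desired,
        std::memory_order order = std::memory_order_seq_cst)
    {
        return word.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed);
    }

A CAS loop call site can then pass a fresh snapshot each iteration, as in the reverted lockSlow()'s compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit), without the failure path scribbling on currentWordValue.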
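
The reverted StaticMutex packed the whole adaptive lock into a single word: bit 0 (isLockedBit) says the lock is held, bit 1 (isQueueLockedBit) says the wait queue is locked, and the word with queueHeadMask cleared is a ThreadData* for the queue head, which works because ThreadData is at least 4-byte aligned. Both fast paths then reduce to single weak CASes, as in this sketch built on the compareExchangeWeak() helper above (AdaptiveLockWord is an illustrative name):

    #include <atomic>
    #include <cstdint>

    class AdaptiveLockWord {
    public:
        bool lockFast()
        {
            // Uncontended acquire: 0 -> isLockedBit. Any set bit (held lock,
            // locked queue, or a queued waiter) forces the slow path.
            return compareExchangeWeak(m_word, clear, isLockedBit, std::memory_order_acquire);
        }

        bool unlockFast()
        {
            // Uncontended release: isLockedBit -> 0. A nonempty queue or a
            // held queue lock makes the CAS fail, routing to the slow path.
            return compareExchangeWeak(m_word, isLockedBit, clear, std::memory_order_release);
        }

    private:
        static constexpr uintptr_t clear = 0;
        static constexpr uintptr_t isLockedBit = 1;
        static constexpr uintptr_t isQueueLockedBit = 2;
        static constexpr uintptr_t queueHeadMask = 3; // low bits, excluded from the ThreadData*

        std::atomic<uintptr_t> m_word { 0 };
    };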
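
When the fast paths fail, the reverted design parks the waiter on a per-thread mutex and condition variable pair (ThreadData). The WordLock.cpp hunk shows the ordering question this rollout toggles: r222945 notified while still holding parkingLock, arguing that otherwise the waiter could wake, return, and destroy its ThreadData before the notify touches it, whereas the restored code notifies after the critical section. A reduced sketch of just that handshake, assuming the queue bookkeeping has already happened:

    #include <condition_variable>
    #include <mutex>

    // Stand-in for the per-thread parking state used by WordLock/StaticMutex.
    struct ThreadData {
        bool shouldPark { false };
        std::mutex parkingLock;
        std::condition_variable parkingCondition;
    };

    // Waiter side: called after this thread has put itself on the lock's
    // queue with shouldPark already set to true.
    void park(ThreadData& me)
    {
        std::unique_lock<std::mutex> locker(me.parkingLock);
        while (me.shouldPark)
            me.parkingCondition.wait(locker);
    }

    // Unlocker side: called on the dequeued queue head.
    void unpark(ThreadData& queueHead)
    {
        std::lock_guard<std::mutex> locker(queueHead.parkingLock);
        queueHead.shouldPark = false;
        // Notifying while still holding parkingLock means the waiter cannot
        // return from park() and destroy its ThreadData between our store to
        // shouldPark and our notify; that is the ordering r222945 relied on.
        queueHead.parkingCondition.notify_one();
    }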
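
Finally, the PerThread.h hunk restores the #if HAVE_PTHREAD_MACHDEP_H split so that the reserved-key specialization and the generic pthread branch are mutually exclusive. The generic branch's pattern, a once-created process-wide key with a destructor that runs at thread exit, looks roughly like the following sketch (PerThreadSlot is an illustrative name; the real bmalloc template separates fast and slow get() paths and does not allocate with plain new):

    #include <mutex>
    #include <pthread.h>

    // Generic per-thread slot in the style of PerThreadStorage's pthread
    // branch: one process-wide key, created once, with a thread-exit destructor.
    template<typename T>
    struct PerThreadSlot {
        static pthread_key_t s_key;
        static std::once_flag s_onceFlag;

        static void destruct(void* p) { delete static_cast<T*>(p); }

        static T* get()
        {
            std::call_once(s_onceFlag, [] {
                pthread_key_create(&s_key, &destruct);
            });
            void* value = pthread_getspecific(s_key);
            if (!value) {
                value = new T();
                pthread_setspecific(s_key, value);
            }
            return static_cast<T*>(value);
        }
    };

    template<typename T> pthread_key_t PerThreadSlot<T>::s_key;
    template<typename T> std::once_flag PerThreadSlot<T>::s_onceFlag;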