Merge r222945 - bmalloc mutex should be adaptive

https://bugs.webkit.org/show_bug.cgi?id=177839 Reviewed by Michael Saboff. Source/bmalloc: This pulls the WordLock algorithm into bmalloc, mostly by copy-pasting the code. We need to copy paste because sometimes we build WTF without bmalloc, so WTF cannot rely on bmalloc for anything other than malloc. Reland after fixing ancient WordLock bug: the notify_one has to happen with the lock held to ensure it doesn't run after that thread has died. * bmalloc/Algorithm.h: (bmalloc::compareExchangeWeak): (bmalloc::compareExchangeStrong): * bmalloc/PerThread.h: * bmalloc/StaticMutex.cpp: (bmalloc::StaticMutex::lockSlow): (bmalloc::StaticMutex::unlockSlow): (bmalloc::StaticMutex::lockSlowCase): Deleted. * bmalloc/StaticMutex.h: (bmalloc::StaticMutex::try_lock): (bmalloc::StaticMutex::isLocked const): (bmalloc::StaticMutex::init): (bmalloc::StaticMutex::tryLock): (bmalloc::StaticMutex::lock): (bmalloc::StaticMutex::unlock): (bmalloc::sleep): Deleted. (bmalloc::waitUntilFalse): Deleted. Source/WTF: Add some comments that I thought of while copy-pasting this code. Reland after fixing ancient WordLock bug: the notify_one has to happen with the lock held to ensure it doesn't run after that thread has died. * wtf/LockAlgorithmInlines.h: * wtf/WordLock.cpp:
WebKit · Oct 17, 2017 · da333d1 · da333d1
1 parent c7604b8
commit da333d1
Show file tree

Hide file tree

Showing 8 changed files with 350 additions and 57 deletions.
diff --git a/Source/WTF/ChangeLog b/Source/WTF/ChangeLog
@@ -1,3 +1,18 @@
+2017-10-04  Filip Pizlo  <fpizlo@apple.com>
+
+        bmalloc mutex should be adaptive
+        https://bugs.webkit.org/show_bug.cgi?id=177839
+
+        Reviewed by Michael Saboff.
+
+        Add some comments that I thought of while copy-pasting this code.
+
+        Reland after fixing ancient WordLock bug: the notify_one has to happen with the lock held
+        to ensure it doesn't run after that thread has died.
+
+        * wtf/LockAlgorithmInlines.h:
+        * wtf/WordLock.cpp:
+
 2017-10-05  Carlos Alberto Lopez Perez  <clopez@igalia.com>
 
         Generate a compile error if release is built without compiler optimizations

diff --git a/Source/WTF/wtf/LockAlgorithmInlines.h b/Source/WTF/wtf/LockAlgorithmInlines.h
@@ -33,6 +33,10 @@
 // the lock algorithm slow path without recompiling the world. Right now this should be included in two
 // places (Lock.cpp and JSCell.cpp).
 
+// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
+// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
+// reward.
+
 namespace WTF {
 
 template<typename LockType, LockType isHeldBit, LockType hasParkedBit>

diff --git a/Source/WTF/wtf/WordLock.cpp b/Source/WTF/wtf/WordLock.cpp
@@ -76,6 +76,10 @@ ThreadData* myThreadData()
 
 } // anonymous namespace
 
+// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
+// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
+// reward.
+
 NEVER_INLINE void WordLockBase::lockSlow()
 {
     unsigned spinCount = 0;
@@ -223,7 +227,8 @@ NEVER_INLINE void WordLockBase::unlockSlow()
     ASSERT(currentWordValue & isLockedBit);
     ASSERT(currentWordValue & isQueueLockedBit);
     ThreadData* queueHead = bitwise_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
-    ASSERT(queueHead);
+    RELEASE_ASSERT(queueHead);
+    RELEASE_ASSERT(queueHead->shouldPark); // This would have been set before the thread was enqueued, so it must still be set now.
 
     ThreadData* newQueueHead = queueHead->nextInQueue;
     // Either this was the only thread on the queue, in which case we delete the queue, or there
@@ -256,10 +261,12 @@ NEVER_INLINE void WordLockBase::unlockSlow()
     {
         std::lock_guard<std::mutex> locker(queueHead->parkingLock);
         queueHead->shouldPark = false;
+        // We have to do the notify while still holding the lock, since otherwise, we could try to
+        // do it after the queueHead has destructed. It's impossible for queueHead to destruct
+        // while we hold the lock, since it is either currently in the wait loop or it's before it
+        // so it has to grab the lock before destructing.
+        queueHead->parkingCondition.notify_one();
     }
-    // Doesn't matter if we notify_all() or notify_one() here since the only thread that could be
-    // waiting is queueHead.
-    queueHead->parkingCondition.notify_one();
 
     // The old queue head can now contend for the lock again. We're done!
 }

diff --git a/Source/bmalloc/ChangeLog b/Source/bmalloc/ChangeLog
@@ -1,3 +1,35 @@
+2017-10-04  Filip Pizlo  <fpizlo@apple.com>
+
+        bmalloc mutex should be adaptive
+        https://bugs.webkit.org/show_bug.cgi?id=177839
+
+        Reviewed by Michael Saboff.
+
+        This pulls the WordLock algorithm into bmalloc, mostly by copy-pasting the code. We need to
+        copy paste because sometimes we build WTF without bmalloc, so WTF cannot rely on bmalloc for
+        anything other than malloc.
+
+        Reland after fixing ancient WordLock bug: the notify_one has to happen with the lock held
+        to ensure it doesn't run after that thread has died.
+
+        * bmalloc/Algorithm.h:
+        (bmalloc::compareExchangeWeak):
+        (bmalloc::compareExchangeStrong):
+        * bmalloc/PerThread.h:
+        * bmalloc/StaticMutex.cpp:
+        (bmalloc::StaticMutex::lockSlow):
+        (bmalloc::StaticMutex::unlockSlow):
+        (bmalloc::StaticMutex::lockSlowCase): Deleted.
+        * bmalloc/StaticMutex.h:
+        (bmalloc::StaticMutex::try_lock):
+        (bmalloc::StaticMutex::isLocked const):
+        (bmalloc::StaticMutex::init):
+        (bmalloc::StaticMutex::tryLock):
+        (bmalloc::StaticMutex::lock):
+        (bmalloc::StaticMutex::unlock):
+        (bmalloc::sleep): Deleted.
+        (bmalloc::waitUntilFalse): Deleted.
+
 2017-09-04  Adrian Perez de Castro  <aperez@igalia.com>
 
         Unreviewed build fix for Clang with libc++

diff --git a/Source/bmalloc/bmalloc/Algorithm.h b/Source/bmalloc/bmalloc/Algorithm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2014 Apple Inc. All rights reserved.
+ * Copyright (C) 2014-2017 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -28,6 +28,7 @@
 
 #include "BAssert.h"
 #include <algorithm>
+#include <atomic>
 #include <cstdint>
 #include <cstddef>
 #include <limits>
@@ -158,6 +159,24 @@ inline constexpr unsigned long log2(unsigned long value)
     return bitCount<unsigned long>() - 1 - __builtin_clzl(value);
 }
 
+// We need a CAS API that isn't the badly designed one from C++.
+template<typename T>
+bool compareExchangeWeak(std::atomic<T>& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst)
+{
+    // They could have made a sensible CAS API, but they didn't. Sneaky fact: for no good reason, the
+    // C++ API will mutate expected. I think that apologists will say something about how it saves you
+    // reloading the value around the back of your CAS loop, but that's nonsense. The delay along the
+    // back edge of a CAS loop has almost no impact on CAS loop performance. More often than not, we
+    // want to *add* delays to the back edge, not remove them.
+    return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed);
+}
+
+template<typename T>
+bool compareExchangeStrong(std::atomic<T>& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst)
+{
+    return word.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed);
+}
+
 } // namespace bmalloc
 
 #endif // Algorithm_h
diff --git a/Source/bmalloc/bmalloc/PerThread.h b/Source/bmalloc/bmalloc/PerThread.h
@@ -60,11 +60,11 @@ class PerThread {
     static void destructor(void*);
 };
 
-#if HAVE_PTHREAD_MACHDEP_H
-
 class Cache;
 template<typename T> struct PerThreadStorage;
 
+#if HAVE_PTHREAD_MACHDEP_H
+
 // For now, we only support PerThread<PerHeapKind<Cache>>. We can expand to other types by
 // using more keys.
 template<> struct PerThreadStorage<PerHeapKind<Cache>> {
@@ -82,7 +82,7 @@ template<> struct PerThreadStorage<PerHeapKind<Cache>> {
     }
 };
 
-#else
+#endif
 
 template<typename T> struct PerThreadStorage {
     static bool s_didInitialize;
@@ -112,8 +112,6 @@ template<typename T> bool PerThreadStorage<T>::s_didInitialize;
 template<typename T> pthread_key_t PerThreadStorage<T>::s_key;
 template<typename T> std::once_flag PerThreadStorage<T>::s_onceFlag;
 
-#endif
-
 template<typename T>
 BINLINE T* PerThread<T>::getFastCase()
 {