diff --git a/Utilities/cond.cpp b/Utilities/cond.cpp
index 718ce5cb30d2..c806549b3663 100644
--- a/Utilities/cond.cpp
+++ b/Utilities/cond.cpp
@@ -273,6 +273,159 @@ void shared_cond::imp_notify() noexcept
 	balanced_awaken(m_cvx32, utils::popcnt32(wait_mask));
 }
 
+void shared_cond::wait_all() noexcept
+{
+	// Try to acquire waiting state without locking but only if there are other locks
+	const auto [old_, result] = m_cvx32.fetch_op([](u64& cvx32) -> u64
+	{
+		// Check waiting alone
+		if ((cvx32 & 0xffffffff) == 0)
+		{
+			return 0;
+		}
+
+		// Combine used bits and invert to find least significant bit unused
+		const u32 slot = utils::cnttz64(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true);
+
+		// Set waiting bit (does nothing if all slots are used)
+		cvx32 |= (1ull << slot) & 0xffffffff;
+		return 1ull << slot;
+	});
+
+	if (!result)
+	{
+		return;
+	}
+
+	if (result > 0xffffffffu)
+	{
+		// All slots are used, fallback to spin wait
+		while (m_cvx32 & 0xffffffff)
+		{
+			busy_wait();
+		}
+
+		return;
+	}
+
+	const u64 wait_bit = result;
+	const u64 lock_bit = wait_bit | (wait_bit << 32);
+
+	balanced_wait_until(m_cvx32, -1, [&](u64& cvx32, auto... ret) -> int
+	{
+		if ((cvx32 & wait_bit) == 0)
+		{
+			// Remove signal and unlock at once
+			cvx32 &= ~lock_bit;
+			return +1;
+		}
+
+		if constexpr (sizeof...(ret))
+		{
+			cvx32 &= ~lock_bit;
+			return -1;
+		}
+
+		return 0;
+	});
+}
+
+bool shared_cond::wait_all(shared_cond::shared_lock& lock) noexcept
+{
+	AUDIT(lock.m_this == this);
+
+	if (lock.m_slot >= 32)
+	{
+		// Invalid argument, assume notified
+		return true;
+	}
+
+	const u64 wait_bit = c_wait << lock.m_slot;
+	const u64 lock_bit = c_lock << lock.m_slot;
+
+	// Try to acquire waiting state only if there are other locks
+	const auto [old_, not_alone] = m_cvx32.fetch_op([&](u64& cvx32)
+	{
+		// Check locking alone
+		if ((cvx32 >> 32) == (lock_bit >> 32))
+		{
+			return false;
+		}
+
+		// c_lock -> c_wait, c_sig -> unlock
+		cvx32 &= ~(lock_bit & ~wait_bit);
+		return true;
+	});
+
+	if (!not_alone)
+	{
+		return false;
+	}
+	else
+	{
+		// Set invalid slot to acknowledge unlocking
+		lock.m_slot = 33;
+	}
+
+	if ((old_ & wait_bit) == 0)
+	{
+		// Already signaled, return without waiting
+		return true;
+	}
+
+	balanced_wait_until(m_cvx32, -1, [&](u64& cvx32, auto... ret) -> int
+	{
+		if ((cvx32 & wait_bit) == 0)
+		{
+			// Remove signal and unlock at once
+			cvx32 &= ~lock_bit;
+			return +1;
+		}
+
+		if constexpr (sizeof...(ret))
+		{
+			cvx32 &= ~lock_bit;
+			return -1;
+		}
+
+		return 0;
+	});
+
+	return true;
+}
+
+void shared_cond::notify_all(shared_cond::shared_lock& lock) noexcept
+{
+	AUDIT(lock.m_this == this);
+
+	const u64 slot_mask = c_sig << lock.m_slot;
+
+	auto [old, ok] = m_cvx32.fetch_op([&](u64& cvx32)
+	{
+		if (const u64 sig_mask = cvx32 & 0xffffffff)
+		{
+			cvx32 &= (0xffffffffull << 32) & ~slot_mask;
+			cvx32 |= (sig_mask << 32) & ~slot_mask;
+			return true;
+		}
+
+		return false;
+	});
+
+	// Set invalid slot to acknowledge unlocking
+	lock.m_slot = 34;
+
+	// Determine if some waiters need a syscall notification
+	const u64 wait_mask = old & (~old >> 32);
+
+	if (UNLIKELY(!ok || !wait_mask))
+	{
+		return;
+	}
+
+	balanced_awaken(m_cvx32, utils::popcnt32(wait_mask));
+}
+
 bool lf_queue_base::wait(u64 _timeout)
 {
 	auto _old = m_head.compare_and_swap(0, 1);
diff --git a/Utilities/cond.h b/Utilities/cond.h
index dc716fab88c8..0938ca72266b 100644
--- a/Utilities/cond.h
+++ b/Utilities/cond.h
@@ -206,7 +206,7 @@ class shared_cond
 			m_slot = m_this->m_cvx32.atomic_op([](u64& cvx32)
 			{
 				// Combine used bits and invert to find least significant bit unused
-				const u32 slot = utils::cnttz32(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true);
+				const u32 slot = utils::cnttz64(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true);
 
 				// Set lock bits (does nothing if all slots are used)
 				const u64 bit = (1ull << slot) & 0xffffffff;
@@ -261,6 +261,10 @@ class shared_cond
 		return imp_wait(lock.m_slot, usec_timeout);
 	}
 
+	void wait_all() noexcept;
+
+	bool wait_all(shared_lock& lock) noexcept;
+
 	void notify_all() noexcept
 	{
 		if (LIKELY(!m_cvx32))
@@ -268,4 +272,6 @@ class shared_cond
 
 		imp_notify();
 	}
+
+	void notify_all(shared_lock& lock) noexcept;
 };
diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp
index 150a197f785b..cb2ea0fb39c9 100644
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -19,10 +19,13 @@ void fmt_class_string<cpu_flag>::format(std::string& out, u64 arg)
 		{
 		case cpu_flag::stop: return "STOP";
 		case cpu_flag::exit: return "EXIT";
+		case cpu_flag::wait: return "w";
+		case cpu_flag::pause: return "p";
 		case cpu_flag::suspend: return "s";
 		case cpu_flag::ret: return "ret";
 		case cpu_flag::signal: return "sig";
 		case cpu_flag::memory: return "mem";
+		case cpu_flag::jit_return: return "JIT";
 		case cpu_flag::dbg_global_pause: return "G-PAUSE";
 		case cpu_flag::dbg_global_stop: return "G-EXIT";
 		case cpu_flag::dbg_pause: return "PAUSE";
@@ -42,10 +45,37 @@ void fmt_class_string<bs_t<cpu_flag>>::format(std::string& out, u64 arg)
 
 thread_local cpu_thread* g_tls_current_cpu_thread = nullptr;
 
-void cpu_thread::operator()()
+// Pseudo-lock for coordination
+alignas(64) shared_cond g_cpu_array_lock;
+
+// Semaphore for global thread array (global counter)
+alignas(64) atomic_t<u32> g_cpu_array_sema;
+
+// Semaphore subdivision for each array slot (64 x N in total)
+atomic_t<u64> g_cpu_array_bits[6]{};
+
+// All registered threads
+atomic_t<cpu_thread*> g_cpu_array[sizeof(g_cpu_array_bits) * 8]{};
+
+template <typename F>
+void for_all_cpu(F&& func) noexcept
 {
-	state -= cpu_flag::exit;
+	for (u32 i = 0; i < ::size32(g_cpu_array_bits); i++)
+	{
+		for (u64 bits = g_cpu_array_bits[i]; bits; bits &= bits - 1)
+		{
+			const u64 index = i * 64 + utils::cnttz64(bits, true);
+
+			if (cpu_thread* cpu = g_cpu_array[index].load())
+			{
+				func(cpu);
+			}
+		}
+	}
+}
 
+void cpu_thread::operator()()
+{
 	g_tls_current_cpu_thread = this;
 
 	if (g_cfg.core.thread_scheduler_enabled)
@@ -58,6 +88,48 @@ void cpu_thread::operator()()
 		thread_ctrl::set_native_priority(-1);
 	}
 
+	// Register thread in g_cpu_array
+	if (!g_cpu_array_sema.try_inc(sizeof(g_cpu_array_bits) * 8))
+	{
+		LOG_FATAL(GENERAL, "Too many threads");
+		Emu.Pause();
+		return;
+	}
+
+	u64 array_slot = -1;
+
+	for (u32 i = 0;; i = (i + 1) % ::size32(g_cpu_array_bits))
+	{
+		if (LIKELY(~g_cpu_array_bits[i]))
+		{
+			const u64 found = g_cpu_array_bits[i].atomic_op([](u64& bits) -> u64
+			{
+				// Find empty array slot and set its bit
+				if (LIKELY(~bits))
+				{
+					const u64 bit = utils::cnttz64(~bits, true);
+					bits |= 1ull << bit;
+					return bit;
+				}
+
+				return 64;
+			});
+
+			if (LIKELY(found < 64))
+			{
+				// Fixup
+				array_slot = i * 64 + found;
+				break;
+			}
+		}
+	}
+
+	// Register and wait if necessary
+	verify("g_cpu_array[...] -> this" HERE), g_cpu_array[array_slot].exchange(this) == nullptr;
+
+	state += cpu_flag::wait;
+	g_cpu_array_lock.wait_all();
+
 	// Check thread status
 	while (!(state & (cpu_flag::exit + cpu_flag::dbg_global_stop)))
 	{
@@ -86,6 +158,13 @@ void cpu_thread::operator()()
 
 		thread_ctrl::wait();
 	}
+
+	// Unregister and wait if necessary
+	state += cpu_flag::wait;
+	verify("g_cpu_array[...] -> null" HERE), g_cpu_array[array_slot].exchange(nullptr) == this;
+	g_cpu_array_bits[array_slot / 64] &= ~(1ull << (array_slot % 64));
+	g_cpu_array_sema--;
+	g_cpu_array_lock.wait_all();
 }
 
 void cpu_thread::on_abort()
@@ -105,7 +184,7 @@ cpu_thread::cpu_thread(u32 id)
 	g_threads_created++;
 }
 
-bool cpu_thread::check_state()
+bool cpu_thread::check_state() noexcept
 {
 #ifdef WITH_GDB_DEBUGGER
 	if (state & cpu_flag::dbg_pause)
@@ -116,6 +195,17 @@
 
 	bool cpu_sleep_called = false;
 	bool cpu_flag_memory = false;
+	bool cpu_flag_wait = false;
+
+	if (state & cpu_flag::wait)
+	{
+		cpu_flag_wait = true;
+	}
+	else if (state & cpu_flag::pause)
+	{
+		state += cpu_flag::wait;
+		cpu_flag_wait = true;
+	}
 
 	while (true)
 	{
@@ -131,8 +221,9 @@
 			state -= cpu_flag::memory;
 		}
 
-		if (state & cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop)
+		if (state & (cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop))
 		{
+			state += cpu_flag::wait;
 			return true;
 		}
 
@@ -141,7 +232,20 @@
 			cpu_sleep_called = false;
 		}
 
-		if (!is_paused())
+		const auto [state0, escape] = state.fetch_op([&](bs_t<cpu_flag>& flags)
+		{
+			// Check pause flags which hold thread inside check_state
+			if (flags & (cpu_flag::pause + cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause))
+			{
+				return false;
+			}
+
+			// Atomically clean wait flag and escape
+			flags -= cpu_flag::wait;
+			return true;
+		});
+
+		if (escape)
 		{
 			if (cpu_flag_memory)
 			{
@@ -150,14 +254,46 @@
 
 			break;
 		}
-		else if (!cpu_sleep_called && state & cpu_flag::suspend)
+		else if (!cpu_sleep_called && state0 & cpu_flag::suspend)
 		{
 			cpu_sleep();
 			cpu_sleep_called = true;
 			continue;
 		}
 
-		thread_ctrl::wait();
+		if (!cpu_flag_wait)
+		{
+			state += cpu_flag::wait;
+			cpu_flag_wait = true;
+
+			// Spin wait once for a bit before resorting to thread_ctrl::wait
+			for (u32 i = 0; i < 10; i++)
+			{
+				if (state0 & (cpu_flag::pause + cpu_flag::suspend))
+				{
+					busy_wait(500);
+				}
+				else
+				{
+					break;
+				}
+			}
+
+			if (!(state0 & (cpu_flag::pause + cpu_flag::suspend)))
+			{
+				continue;
+			}
+		}
+
+		if (state0 & (cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause))
+		{
+			thread_ctrl::wait();
+		}
+		else
+		{
+			// If only cpu_flag::pause was set, notification won't arrive
+			g_cpu_array_lock.wait_all();
+		}
 	}
 
 	const auto state_ = state.load();
@@ -196,3 +332,67 @@ std::string cpu_thread::dump() const
 {
 	return fmt::format("Type: %s\n" "State: %s\n", typeid(*this).name(), state.load());
 }
+
+cpu_thread::suspend_all::suspend_all(cpu_thread* _this) noexcept
+	: m_lock(g_cpu_array_lock.try_shared_lock())
+	, m_this(_this)
+{
+	// TODO
+	if (!m_lock)
+	{
+		LOG_FATAL(GENERAL, "g_cpu_array_lock: too many concurrent accesses");
+		Emu.Pause();
+		return;
+	}
+
+	if (m_this)
+	{
+		m_this->state += cpu_flag::wait;
+	}
+
+	for_all_cpu([](cpu_thread* cpu)
+	{
+		cpu->state += cpu_flag::pause;
+	});
+
+	busy_wait(500);
+
+	while (true)
+	{
+		bool ok = true;
+
+		for_all_cpu([&](cpu_thread* cpu)
+		{
+			if (!(cpu->state & cpu_flag::wait))
+			{
+				ok = false;
+			}
+		});
+
+		if (LIKELY(ok))
+		{
+			break;
+		}
+
+		busy_wait(1000);
+	}
+}
+
+cpu_thread::suspend_all::~suspend_all()
+{
+	// Make sure latest worker does cleanup and notifies others
+	if (!g_cpu_array_lock.wait_all(m_lock))
+	{
+		for_all_cpu([](cpu_thread* cpu)
+		{
+			cpu->state -= cpu_flag::pause;
+		});
+
+		g_cpu_array_lock.notify_all(m_lock);
+	}
+
+	if (m_this)
+	{
+		m_this->check_state();
+	}
+}
diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h
index 7eb3fdf63373..a1f3af46e9d1 100644
--- a/rpcs3/Emu/CPU/CPUThread.h
+++ b/rpcs3/Emu/CPU/CPUThread.h
@@ -2,12 +2,15 @@
 
 #include "../Utilities/Thread.h"
 #include "../Utilities/bit_set.h"
+#include "../Utilities/cond.h"
 
 // Thread state flags
 enum class cpu_flag : u32
 {
 	stop, // Thread not running (HLE, initial state)
 	exit, // Irreversible exit
+	wait, // Indicates waiting state, set by the thread itself
+	pause, // Thread suspended by suspend_all technique
 	suspend, // Thread suspended
 	ret, // Callback return requested
 	signal, // Thread received a signal (HLE)
@@ -39,15 +42,15 @@
 	const u32 id;
 
 	// Public thread state
-	atomic_bs_t<cpu_flag> state{+cpu_flag::stop};
+	atomic_bs_t<cpu_flag> state{cpu_flag::stop + cpu_flag::wait};
 
 	// Process thread state, return true if the checker must return
-	bool check_state();
+	bool check_state() noexcept;
 
 	// Process thread state (pause)
 	[[nodiscard]] bool test_stopped()
 	{
-		if (UNLIKELY(state))
+		if (state)
 		{
 			if (check_state())
 			{
@@ -99,6 +102,20 @@
 
 	// Callback for vm::temporary_unlock
 	virtual void cpu_unmem() {}
+
+	// Thread locker
+	class suspend_all
+	{
+		decltype(std::declval<shared_cond&>().try_shared_lock()) m_lock;
+
+		cpu_thread* m_this;
+
+	public:
+		suspend_all(cpu_thread* _this) noexcept;
+		suspend_all(const suspend_all&) = delete;
+		suspend_all& operator=(const suspend_all&) = delete;
+		~suspend_all();
+	};
 };
 
 inline cpu_thread* get_current_cpu_thread() noexcept
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 2fbc73150117..6fc7b839a537 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -343,7 +343,6 @@ const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
+const auto spu_getll_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -558,7 +551,6 @@ const auto spu_getll_tx = build_function_asm
 		if (count >= 10)
 		{
-			LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
+			LOG_WARNING(SPU, "%s took too long: %u", args.cmd, count);
+		}
+
+		if (count >= 16)
+		{
+			cpu_thread::suspend_all cpu_lock(this);
+
+			while (atomic_storage<u64>::bts(vm::reservation_acquire(addr, 128).raw(), 6))
+			{
+				busy_wait(100);
+			}
+
+			mov_rdata(vm::_ref(addr).data(), to_write.data());
+			vm::reservation_acquire(addr, 128) += 63;
 		}
 	}
 	else
@@ -1847,6 +1855,8 @@ bool spu_thread::process_mfc_cmd()
 	// Stall infinitely if MFC queue is full
 	while (UNLIKELY(mfc_size >= 16))
 	{
+		state += cpu_flag::wait;
+
 		if (is_stopped())
 		{
 			return false;
 		}
@@ -1875,6 +1885,8 @@ bool spu_thread::process_mfc_cmd()
 
 			while (rdata == data && (vm::reservation_acquire(addr, 128)) == rtime)
 			{
+				state += cpu_flag::wait;
+
 				if (is_stopped())
 				{
 					break;
 				}
@@ -1882,17 +1894,39 @@ bool spu_thread::process_mfc_cmd()
 
 				thread_ctrl::wait_for(100);
 			}
+
+			if (test_stopped())
+			{
+				return false;
+			}
 		}
 
-		if (LIKELY(g_use_rtm))
+		if (LIKELY(g_use_rtm && !g_cfg.core.spu_accurate_getllar))
 		{
-			const u64 count = g_cfg.core.spu_accurate_getllar ? spu_getll_tx(addr, dst.data(), &ntime) : spu_getll_fast(addr, dst.data(), &ntime);
+			const u64 count = spu_getll_fast(addr, dst.data(), &ntime);
 
 			if (count >= 10)
 			{
 				LOG_ERROR(SPU, "%s took too long: %u", ch_mfc_cmd.cmd, count);
 			}
 		}
+		else if (g_use_rtm)
+		{
+			ntime = spu_getll_tx(addr, dst.data());
+
+			if (ntime == 1)
+			{
+				cpu_thread::suspend_all cpu_lock(this);
+
+				while (vm::reservation_acquire(addr, 128) & 127)
+				{
+					busy_wait(100);
+				}
+
+				ntime = vm::reservation_acquire(addr, 128);
+				mov_rdata(dst.data(), data.data());
+			}
+		}
 		else
 		{
 			auto& res = vm::reservation_lock(addr, 128);
@@ -1968,7 +2002,34 @@ bool spu_thread::process_mfc_cmd()
 
 			if (count >= 10)
 			{
-				LOG_ERROR(SPU, "%s took too long: %u (r=%u)", ch_mfc_cmd.cmd, count, result);
+				LOG_WARNING(SPU, "%s took too long: %u (r=%u)", ch_mfc_cmd.cmd, count, result);
+			}
+
+			if (count >= 16 && !result)
+			{
+				cpu_thread::suspend_all cpu_lock(this);
+
+				while (atomic_storage<u64>::bts(vm::reservation_acquire(addr, 128).raw(), 6))
+				{
+					busy_wait(100);
+				}
+
+				auto& data = vm::_ref<decltype(rdata)>(addr);
+
+				if ((vm::reservation_acquire(addr, 128) & -128) == rtime && rdata == data)
+				{
+					mov_rdata(data.data(), to_write.data());
+					vm::reservation_acquire(addr, 128) += 63;
+					result = 1;
+				}
+				else
+				{
+					vm::reservation_acquire(addr, 128) -= 65;
+				}
+			}
+			else if (count && !result)
+			{
+				vm::reservation_acquire(addr, 128) -= 1;
 			}
 		}
 		else if (auto& data = vm::_ref<decltype(rdata)>(addr); rdata == data)
@@ -2256,6 +2317,11 @@ s64 spu_thread::get_ch_value(u32 ch)
 
 	auto read_channel = [&](spu_channel& channel) -> s64
 	{
+		if (channel.get_count() == 0)
+		{
+			state += cpu_flag::wait;
+		}
+
 		for (int i = 0; i < 10 && channel.get_count() == 0; i++)
 		{
 			busy_wait();
@@ -2273,6 +2339,7 @@ s64 spu_thread::get_ch_value(u32 ch)
 			thread_ctrl::wait();
 		}
 
+		check_state();
 		return out;
 	};
 
@@ -2284,6 +2351,11 @@ s64 spu_thread::get_ch_value(u32 ch)
 	}
 
 	case SPU_RdInMbox:
 	{
+		if (ch_in_mbox.get_count() == 0)
+		{
+			state += cpu_flag::wait;
+		}
+
 		while (true)
 		{
 			for (int i = 0; i < 10 && ch_in_mbox.get_count() == 0; i++)
@@ -2300,6 +2372,7 @@ s64 spu_thread::get_ch_value(u32 ch)
 				int_ctrl[2].set(SPU_INT2_STAT_SPU_MAILBOX_THRESHOLD_INT);
 			}
 
+			check_state();
 			return out;
 		}
 
@@ -2410,6 +2483,8 @@ s64 spu_thread::get_ch_value(u32 ch)
 
 			while (res = get_events(), !res)
 			{
+				state += cpu_flag::wait;
+
 				if (is_stopped())
 				{
 					return -1;
 				}
@@ -2418,11 +2493,14 @@ s64 spu_thread::get_ch_value(u32 ch)
 
 				pseudo_lock.wait(100);
 			}
 
+			check_state();
 			return res;
 		}
 
 		while (res = get_events(true), !res)
 		{
+			state += cpu_flag::wait;
+
 			if (is_stopped())
 			{
 				return -1;
 			}
@@ -2431,6 +2509,7 @@ s64 spu_thread::get_ch_value(u32 ch)
 
 			thread_ctrl::wait_for(100);
 		}
 
+		check_state();
 		return res;
 	}
 
@@ -2463,6 +2542,8 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
 		{
 			while (!ch_out_intr_mbox.try_push(value))
 			{
+				state += cpu_flag::wait;
+
 				if (is_stopped())
 				{
 					return false;
 				}
@@ -2472,6 +2553,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
 			}
 
 			int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT);
+			check_state();
 			return true;
 		}
 
@@ -2609,6 +2691,8 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
 		{
 			while (!ch_out_mbox.try_push(value))
 			{
+				state += cpu_flag::wait;
+
 				if (is_stopped())
 				{
 					return false;
 				}
@@ -2617,6 +2701,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
 
 				thread_ctrl::wait();
 			}
 
+			check_state();
 			return true;
 		}
 
@@ -2808,6 +2893,8 @@ bool spu_thread::stop_and_signal(u32 code)
 		// HACK: wait for executable code
 		while (!_ref<u32>(pc))
 		{
+			state += cpu_flag::wait;
+
 			if (is_stopped())
 			{
 				return false;
 			}
@@ -2816,12 +2903,15 @@ bool spu_thread::stop_and_signal(u32 code)
 
 			thread_ctrl::wait_for(1000);
 		}
 
+		check_state();
 		return false;
 	}
 
 	case 0x001:
 	{
+		state += cpu_flag::wait;
 		thread_ctrl::wait_for(1000); // hack
+		check_state();
 		return true;
 	}
 
@@ -2857,6 +2947,8 @@ bool spu_thread::stop_and_signal(u32 code)
 
 		std::shared_ptr<lv2_event_queue> queue;
 
+		state += cpu_flag::wait;
+
 		while (true)
 		{
 			queue.reset();
@@ -2897,6 +2989,7 @@ bool spu_thread::stop_and_signal(u32 code)
 
 			if (!queue)
 			{
+				check_state();
 				return ch_in_mbox.set_values(1, CELL_EINVAL), true; // TODO: check error value
 			}
 
@@ -2927,6 +3020,7 @@ bool spu_thread::stop_and_signal(u32 code)
 				const auto data3 = static_cast<u32>(std::get<3>(event));
 				ch_in_mbox.set_values(4, CELL_OK, data1, data2, data3);
 				queue->events.pop_front();
+				check_state();
 				return true;
 			}
 		}
@@ -2972,6 +3066,7 @@ bool spu_thread::stop_and_signal(u32 code)
 			}
 		}
 
+		check_state();
 		return true;
 	}
 
diff --git a/rpcs3/Emu/Cell/lv2/sys_net.cpp b/rpcs3/Emu/Cell/lv2/sys_net.cpp
index 8d08b6aa1023..ea3d87d302e2 100644
--- a/rpcs3/Emu/Cell/lv2/sys_net.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_net.cpp
@@ -357,6 +357,11 @@ s32 sys_net_bnet_accept(ppu_thread& ppu, s32 s, vm::ptr<sys_net_sockaddr> addr, vm::ptr<u32> paddrlen)
 		}
 	}
 
+	if (ppu.is_stopped())
+	{
+		return 0;
+	}
+
 	auto newsock = std::make_shared<lv2_socket>(native_socket);
 
 	result = idm::import_existing<lv2_socket>(newsock);
@@ -975,6 +980,11 @@ s32 sys_net_bnet_recvfrom(ppu_thread& ppu, s32 s, vm::ptr<void> buf, u32 len, s32 flags, vm::ptr<sys_net_sockaddr> addr, vm::ptr<u32> paddrlen)
 		}
 	}
 
+	if (ppu.is_stopped())
+	{
+		return 0;
+	}
+
 	// TODO
 	if (addr)
 	{
@@ -1796,6 +1806,11 @@ s32 sys_net_bnet_select(ppu_thread& ppu, s32 nfds, vm::ptr<sys_net_fd_set> readfds, vm::ptr<sys_net_fd_set> writefds, vm::ptr<sys_net_fd_set> exceptfds, vm::ptr<sys_net_timeval> timeout)
 		}
 	}
 
+	if (ppu.is_stopped())
+	{
+		return 0;
+	}
+
 	if (readfds)
 		*readfds = rread;
 
 	if (writefds)
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index 0e4fed623e01..1a6dd0a6239c 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -172,6 +172,8 @@ namespace vm
 
 	void temporary_unlock(cpu_thread& cpu) noexcept
 	{
+		cpu.state += cpu_flag::wait;
+
 		if (g_tls_locked && g_tls_locked->compare_and_swap_test(&cpu, nullptr))
 		{
 			cpu.cpu_unmem();
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index 8ae7c33f05cb..1bc23dfebd67 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -379,7 +379,6 @@ struct cfg_root : cfg::node
 		cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
 		cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
 		cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};
-		cfg::_bool spu_accurate_putllc{this, "Accurate PUTLLC", false};
 		cfg::_bool spu_verification{this, "SPU Verification", true}; // Should be enabled
 		cfg::_bool spu_cache{this, "SPU Cache", true};
 		cfg::_enum<tsx_usage> enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully