From 425e032a62fee3ed295454bf20c8be0252de9d3b Mon Sep 17 00:00:00 2001
From: Emmanuel Gil Peyrot
Date: Wed, 20 Nov 2019 16:53:22 +0100
Subject: [PATCH] SPU: Copy with memcpy() instead of hand-rolled SSE2

In a very unscientific benchmark, spu_thread::do_dma_transfer() went
from 2.27% of my CPU time to 0.07%, while __memmove_avx_unaligned_erms()
went from 1.47% to 2.88%; added up, that is roughly 0.8% saved.
---
 rpcs3/Emu/Cell/SPUThread.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index cb9f124ef2a2..d1c58afe9159 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -1433,6 +1433,9 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 
 				auto lock = vm::passive_lock(eal & -128, ::align(eal + size, 128));
 
+#ifdef __GNUG__
+				std::memcpy(dst, src, size);
+#else
 				while (size >= 128)
 				{
 					mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@@ -1450,6 +1453,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 					src += 16;
 					size -= 16;
 				}
+#endif
 
 				lock->release(0);
 				break;
@@ -1483,6 +1487,9 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 		}
 		default:
 		{
+#ifdef __GNUG__
+			std::memcpy(dst, src, size);
+#else
 			while (size >= 128)
 			{
 				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@@ -1500,6 +1507,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				src += 16;
 				size -= 16;
 			}
+#endif
 			break;
 		}
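
Note (illustrative, not part of the patch): the sketch below shows the general shape of the trade-off this change makes, assuming plain byte buffers and hypothetical helper names (copy_sse2, copy_memcpy). The hand-rolled loop moves data in fixed 16-byte SSE2 chunks, while std::memcpy lets the C library pick a wider path at runtime, such as the AVX/ERMS variant named in the benchmark above.

// Illustrative sketch only; helper names and buffer types are assumptions.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <emmintrin.h> // SSE2 intrinsics

// Hand-rolled SSE2 copy in 16-byte chunks, similar in spirit to the loops
// moved behind the new #else branch. Assumes size is a multiple of 16.
inline void copy_sse2(std::uint8_t* dst, const std::uint8_t* src, std::size_t size)
{
	while (size >= 16)
	{
		const __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
		_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), v);
		dst += 16;
		src += 16;
		size -= 16;
	}
}

// Library copy, as the patch uses under __GNUG__: glibc dispatches memcpy to
// the widest implementation available (e.g. __memmove_avx_unaligned_erms).
inline void copy_memcpy(std::uint8_t* dst, const std::uint8_t* src, std::size_t size)
{
	std::memcpy(dst, src, size);
}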