Optimise primitive_restart::upload_untouched()

This optimisation targets SSE4.1 and AVX2 and is only applied when skip_restart is false. I’ve only tested the u16 codepath, as it is the one used in NieR. In some very unscientific profiling, this function used to take 2.76% of the total time at some spot of the port town, it now takes about 0.40% on SSE4.1 and 0.30% on AVX2.
RPCS3 · Oct 26, 2019 · e255c81 · e255c81
1 parent f3ed26e
commit e255c81
Showing 1 changed file with 258 additions and 7 deletions.
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -58,6 +58,7 @@ __sse_intrin __m128i __mm_min_epu16(__m128i opd, __m128i opa)
 
 const bool s_use_ssse3 = utils::has_ssse3();
 const bool s_use_sse4_1 = utils::has_sse41();
+const bool s_use_avx2 = utils::has_avx2();
 
 namespace
 {
@@ -755,36 +756,286 @@ namespace
 
 	struct primitive_restart_impl
 	{
+#if __AVX2__
+		static
+		std::tuple<u16, u16, u32> upload_u16_swapped_avx2(const void *src, void *dst, u32 count, u32 restart_index)
+		{
+			u32 dst_index = 0;
+
+			const __m256i mask = _mm256_set_epi8(
+				0x1E, 0x1F, 0x1C, 0x1D,
+				0x1A, 0x1B, 0x18, 0x19,
+				0x16, 0x17, 0x14, 0x15,
+				0x12, 0x13, 0x10, 0x11,
+				0xE, 0xF, 0xC, 0xD,
+				0xA, 0xB, 0x8, 0x9,
+				0x6, 0x7, 0x4, 0x5,
+				0x2, 0x3, 0x0, 0x1);
+
+			auto src_stream = (const __m256i*)src;
+			auto dst_stream = (__m256i*)dst;
+
+			__m256i restart = _mm256_set1_epi16(restart_index);
+			__m256i min = _mm256_set1_epi16(0xffff);
+			__m256i max = _mm256_set1_epi16(0);
+
+			const auto iterations = count / 8;
+			for (unsigned n = 0; n < iterations; ++n)
+			{
+				const __m256i raw = _mm256_loadu_si256(src_stream++);
+				const __m256i value = _mm256_shuffle_epi8(raw, mask);
+				const __m256i mask = _mm256_cmpeq_epi16(restart, value);
+				const __m256i tmp = _mm256_andnot_si256(mask, value);
+				max = _mm256_max_epu16(max, tmp);
+				min = _mm256_min_epu16(min, value);
+				_mm256_storeu_si256(dst_stream++, value);
+			}
+
+			const __m256i mask_step1 = _mm256_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18,
+				0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10);
+
+			const __m256i mask_step2 = _mm256_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
+
+			const __m256i mask_step3 = _mm256_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
+
+			const __m256i mask_step4 = _mm256_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0x3, 0x2);
+
+			__m256i tmp = _mm256_shuffle_epi8(min, mask_step1);
+			min = _mm256_min_epu16(min, tmp);
+			tmp = _mm256_shuffle_epi8(min, mask_step2);
+			min = _mm256_min_epu16(min, tmp);
+			tmp = _mm256_shuffle_epi8(min, mask_step3);
+			min = _mm256_min_epu16(min, tmp);
+			tmp = _mm256_shuffle_epi8(min, mask_step4);
+			min = _mm256_min_epu16(min, tmp);
+
+			tmp = _mm256_shuffle_epi8(max, mask_step1);
+			max = _mm256_max_epu16(max, tmp);
+			tmp = _mm256_shuffle_epi8(max, mask_step2);
+			max = _mm256_max_epu16(max, tmp);
+			tmp = _mm256_shuffle_epi8(max, mask_step3);
+			max = _mm256_max_epu16(max, tmp);
+			tmp = _mm256_shuffle_epi8(max, mask_step4);
+			max = _mm256_max_epu16(max, tmp);
+
+			const u16 min_index = u16(_mm_cvtsi128_si32(_mm256_castsi256_si128(min)) & 0xFFFF);
+			const u16 max_index = u16(_mm_cvtsi128_si32(_mm256_castsi256_si128(max)) & 0xFFFF);
+
+			return std::make_tuple(min_index, max_index, count);
+		}
+#endif
+
+		static
+		std::tuple<u16, u16, u32> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 count, u32 restart_index)
+		{
+			u32 dst_index = 0;
+
+			const __m128i mask = _mm_set_epi8(
+				0xE, 0xF, 0xC, 0xD,
+				0xA, 0xB, 0x8, 0x9,
+				0x6, 0x7, 0x4, 0x5,
+				0x2, 0x3, 0x0, 0x1);
+
+			auto src_stream = (const __m128i*)src;
+			auto dst_stream = (__m128i*)dst;
+
+			__m128i restart = _mm_set1_epi16(restart_index);
+			__m128i min = _mm_set1_epi16(0xffff);
+			__m128i max = _mm_set1_epi16(0);
+
+			const auto iterations = count / 8;
+			for (unsigned n = 0; n < iterations; ++n)
+			{
+				const __m128i raw = _mm_loadu_si128(src_stream++);
+				const __m128i value = __mm_shuffle_epi8(raw, mask);
+				const __m128i mask = _mm_cmpeq_epi16(restart, value);
+				const __m128i tmp = _mm_andnot_si128(mask, value);
+				max = __mm_max_epu16(max, tmp);
+				min = __mm_min_epu16(min, value);
+				_mm_storeu_si128(dst_stream++, value);
+			}
+
+			const __m128i mask_step1 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
+
+			const __m128i mask_step2 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
+
+			const __m128i mask_step3 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0x3, 0x2);
+
+			__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
+			min = __mm_min_epu16(min, tmp);
+			tmp = __mm_shuffle_epi8(min, mask_step2);
+			min = __mm_min_epu16(min, tmp);
+			tmp = __mm_shuffle_epi8(min, mask_step3);
+			min = __mm_min_epu16(min, tmp);
+
+			tmp = __mm_shuffle_epi8(max, mask_step1);
+			max = __mm_max_epu16(max, tmp);
+			tmp = __mm_shuffle_epi8(max, mask_step2);
+			max = __mm_max_epu16(max, tmp);
+			tmp = __mm_shuffle_epi8(max, mask_step3);
+			max = __mm_max_epu16(max, tmp);
+
+			const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
+			const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
+
+			return std::make_tuple(min_index, max_index, count);
+		}
+
+		static
+		std::tuple<u32, u32, u32> upload_u32_swapped(const void *src, void *dst, u32 count, u32 restart_index)
+		{
+			u32 dst_index = 0;
+
+			const __m128i mask = _mm_set_epi8(
+				0xC, 0xD, 0xE, 0xF,
+				0x8, 0x9, 0xA, 0xB,
+				0x4, 0x5, 0x6, 0x7,
+				0x0, 0x1, 0x2, 0x3);
+
+			auto src_stream = (const __m128i*)src;
+			auto dst_stream = (__m128i*)dst;
+
+			__m128i restart = _mm_set1_epi32(restart_index);
+			__m128i min = _mm_set1_epi32(0xffffffff);
+			__m128i max = _mm_set1_epi32(0);
+
+			const auto iterations = count / 4;
+			for (unsigned n = 0; n < iterations; ++n)
+			{
+				const __m128i raw = _mm_loadu_si128(src_stream++);
+				const __m128i value = __mm_shuffle_epi8(raw, mask);
+				const __m128i mask = _mm_cmpeq_epi32(restart, value);
+				const __m128i tmp = _mm_andnot_si128(mask, value);
+				max = __mm_max_epu32(max, tmp);
+				min = __mm_min_epu32(min, value);
+				_mm_storeu_si128(dst_stream++, value);
+			}
+
+			const __m128i mask_step1 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
+
+			const __m128i mask_step2 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
+
+			__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
+			min = __mm_min_epu32(min, tmp);
+			tmp = __mm_shuffle_epi8(min, mask_step2);
+			min = __mm_min_epu32(min, tmp);
+
+			tmp = __mm_shuffle_epi8(max, mask_step1);
+			max = __mm_max_epu32(max, tmp);
+			tmp = __mm_shuffle_epi8(max, mask_step2);
+			max = __mm_max_epu32(max, tmp);
+
+			const u32 min_index = u32(_mm_cvtsi128_si32(min));
+			const u32 max_index = u32(_mm_cvtsi128_si32(max));
+
+			return std::make_tuple(min_index, max_index, count);
+		}
+
 		template<typename T>
 		static
 		std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, u32 restart_index, bool skip_restart)
 		{
-			T min_index = index_limit<T>(), max_index = 0;
-			u32 dst_index = 0;
+			T min_index, max_index;
+			u32 written;
+			u32 remaining = src.size();
+
+#if __AVX2__
+			if (s_use_avx2 && remaining >= 64 && !skip_restart)
+			{
+				if constexpr (std::is_same<T, u32>::value)
+				{
+					const auto count = (remaining & ~0x3);
+					std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count, restart_index);
+				}
+				else if constexpr (std::is_same<T, u16>::value)
+				{
+					const auto count = (remaining & ~0xf);
+					std::tie(min_index, max_index, written) = upload_u16_swapped_avx2(src.data(), dst.data(), count, restart_index);
+				}
+				else
+				{
+					fmt::throw_exception("Unreachable" HERE);
+				}
 
-			for (const T index : src)
+				remaining -= written;
+			}
+			else
+#endif
+			if (s_use_sse4_1 && remaining >= 32 && !skip_restart)
 			{
+				if constexpr (std::is_same<T, u32>::value)
+				{
+					const auto count = (remaining & ~0x3);
+					std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count, restart_index);
+				}
+				else if constexpr (std::is_same<T, u16>::value)
+				{
+					const auto count = (remaining & ~0x7);
+					std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count, restart_index);
+				}
+				else
+				{
+					fmt::throw_exception("Unreachable" HERE);
+				}
+
+				remaining -= written;
+			}
+			else
+			{
+				min_index = index_limit<T>();
+				max_index = 0;
+				written = 0;
+			}
+
+			while (remaining--)
+			{
+				T index = src[written];
 				if (index == restart_index)
 				{
 					if (!skip_restart)
 					{
-						dst[dst_index++] = index_limit<T>();
+						dst[written++] = index_limit<T>();
 					}
 				}
 				else
 				{
-					dst[dst_index++] = min_max(min_index, max_index, index);
+					dst[written++] = min_max(min_index, max_index, index);
 				}
 			}
 
-			return std::make_tuple(min_index, max_index, dst_index);
+			return std::make_tuple(min_index, max_index, written);
 		}
 	};
 
 	template<typename T>
 	std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
 	{
-		if (LIKELY(!is_primitive_restart_enabled))
+		if (UNLIKELY(!is_primitive_restart_enabled))
 		{
 			return untouched_impl::upload_untouched(src, dst);
 		}