Skip to content

Commit

Permalink
Optimise primitive_restart::upload_untouched()
Browse files Browse the repository at this point in the history
This optimisation targets SSE4.1 and AVX2 and is only applied when
skip_restart is false.

I’ve only tested the u16 codepath, as it is the one used in NieR.

In some very unscientific profiling, this function used to take 2.76% of
the total time at some spot of the port town, it now takes about 0.40%
on SSE4.1 and 0.30% on AVX2.
  • Loading branch information
linkmauve committed Oct 26, 2019
1 parent f3ed26e commit e255c81
Showing 1 changed file with 258 additions and 7 deletions.
265 changes: 258 additions & 7 deletions rpcs3/Emu/RSX/Common/BufferUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ __sse_intrin __m128i __mm_min_epu16(__m128i opd, __m128i opa)

const bool s_use_ssse3 = utils::has_ssse3();
const bool s_use_sse4_1 = utils::has_sse41();
const bool s_use_avx2 = utils::has_avx2();

namespace
{
Expand Down Expand Up @@ -755,36 +756,286 @@ namespace

struct primitive_restart_impl
{
#if __AVX2__
static
std::tuple<u16, u16, u32> upload_u16_swapped_avx2(const void *src, void *dst, u32 count, u32 restart_index)
{
u32 dst_index = 0;

const __m256i mask = _mm256_set_epi8(
0x1E, 0x1F, 0x1C, 0x1D,
0x1A, 0x1B, 0x18, 0x19,
0x16, 0x17, 0x14, 0x15,
0x12, 0x13, 0x10, 0x11,
0xE, 0xF, 0xC, 0xD,
0xA, 0xB, 0x8, 0x9,
0x6, 0x7, 0x4, 0x5,
0x2, 0x3, 0x0, 0x1);

auto src_stream = (const __m256i*)src;
auto dst_stream = (__m256i*)dst;

__m256i restart = _mm256_set1_epi16(restart_index);
__m256i min = _mm256_set1_epi16(0xffff);
__m256i max = _mm256_set1_epi16(0);

const auto iterations = count / 8;
for (unsigned n = 0; n < iterations; ++n)
{
const __m256i raw = _mm256_loadu_si256(src_stream++);
const __m256i value = _mm256_shuffle_epi8(raw, mask);
const __m256i mask = _mm256_cmpeq_epi16(restart, value);
const __m256i tmp = _mm256_andnot_si256(mask, value);
max = _mm256_max_epu16(max, tmp);
min = _mm256_min_epu16(min, value);
_mm256_storeu_si256(dst_stream++, value);
}

const __m256i mask_step1 = _mm256_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18,
0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10);

const __m256i mask_step2 = _mm256_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);

const __m256i mask_step3 = _mm256_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);

const __m256i mask_step4 = _mm256_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0x3, 0x2);

__m256i tmp = _mm256_shuffle_epi8(min, mask_step1);
min = _mm256_min_epu16(min, tmp);
tmp = _mm256_shuffle_epi8(min, mask_step2);
min = _mm256_min_epu16(min, tmp);
tmp = _mm256_shuffle_epi8(min, mask_step3);
min = _mm256_min_epu16(min, tmp);
tmp = _mm256_shuffle_epi8(min, mask_step4);
min = _mm256_min_epu16(min, tmp);

tmp = _mm256_shuffle_epi8(max, mask_step1);
max = _mm256_max_epu16(max, tmp);
tmp = _mm256_shuffle_epi8(max, mask_step2);
max = _mm256_max_epu16(max, tmp);
tmp = _mm256_shuffle_epi8(max, mask_step3);
max = _mm256_max_epu16(max, tmp);
tmp = _mm256_shuffle_epi8(max, mask_step4);
max = _mm256_max_epu16(max, tmp);

const u16 min_index = u16(_mm_cvtsi128_si32(_mm256_castsi256_si128(min)) & 0xFFFF);
const u16 max_index = u16(_mm_cvtsi128_si32(_mm256_castsi256_si128(max)) & 0xFFFF);

return std::make_tuple(min_index, max_index, count);
}
#endif

static
std::tuple<u16, u16, u32> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 count, u32 restart_index)
{
u32 dst_index = 0;

const __m128i mask = _mm_set_epi8(
0xE, 0xF, 0xC, 0xD,
0xA, 0xB, 0x8, 0x9,
0x6, 0x7, 0x4, 0x5,
0x2, 0x3, 0x0, 0x1);

auto src_stream = (const __m128i*)src;
auto dst_stream = (__m128i*)dst;

__m128i restart = _mm_set1_epi16(restart_index);
__m128i min = _mm_set1_epi16(0xffff);
__m128i max = _mm_set1_epi16(0);

const auto iterations = count / 8;
for (unsigned n = 0; n < iterations; ++n)
{
const __m128i raw = _mm_loadu_si128(src_stream++);
const __m128i value = __mm_shuffle_epi8(raw, mask);
const __m128i mask = _mm_cmpeq_epi16(restart, value);
const __m128i tmp = _mm_andnot_si128(mask, value);
max = __mm_max_epu16(max, tmp);
min = __mm_min_epu16(min, value);
_mm_storeu_si128(dst_stream++, value);
}

const __m128i mask_step1 = _mm_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);

const __m128i mask_step2 = _mm_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);

const __m128i mask_step3 = _mm_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0x3, 0x2);

__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
min = __mm_min_epu16(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step2);
min = __mm_min_epu16(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step3);
min = __mm_min_epu16(min, tmp);

tmp = __mm_shuffle_epi8(max, mask_step1);
max = __mm_max_epu16(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step2);
max = __mm_max_epu16(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step3);
max = __mm_max_epu16(max, tmp);

const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);

return std::make_tuple(min_index, max_index, count);
}

static
std::tuple<u32, u32, u32> upload_u32_swapped(const void *src, void *dst, u32 count, u32 restart_index)
{
u32 dst_index = 0;

const __m128i mask = _mm_set_epi8(
0xC, 0xD, 0xE, 0xF,
0x8, 0x9, 0xA, 0xB,
0x4, 0x5, 0x6, 0x7,
0x0, 0x1, 0x2, 0x3);

auto src_stream = (const __m128i*)src;
auto dst_stream = (__m128i*)dst;

__m128i restart = _mm_set1_epi32(restart_index);
__m128i min = _mm_set1_epi32(0xffffffff);
__m128i max = _mm_set1_epi32(0);

const auto iterations = count / 4;
for (unsigned n = 0; n < iterations; ++n)
{
const __m128i raw = _mm_loadu_si128(src_stream++);
const __m128i value = __mm_shuffle_epi8(raw, mask);
const __m128i mask = _mm_cmpeq_epi32(restart, value);
const __m128i tmp = _mm_andnot_si128(mask, value);
max = __mm_max_epu32(max, tmp);
min = __mm_min_epu32(min, value);
_mm_storeu_si128(dst_stream++, value);
}

const __m128i mask_step1 = _mm_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);

const __m128i mask_step2 = _mm_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);

__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
min = __mm_min_epu32(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step2);
min = __mm_min_epu32(min, tmp);

tmp = __mm_shuffle_epi8(max, mask_step1);
max = __mm_max_epu32(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step2);
max = __mm_max_epu32(max, tmp);

const u32 min_index = u32(_mm_cvtsi128_si32(min));
const u32 max_index = u32(_mm_cvtsi128_si32(max));

return std::make_tuple(min_index, max_index, count);
}

template<typename T>
static
std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, u32 restart_index, bool skip_restart)
{
T min_index = index_limit<T>(), max_index = 0;
u32 dst_index = 0;
T min_index, max_index;
u32 written;
u32 remaining = src.size();

#if __AVX2__
if (s_use_avx2 && remaining >= 64 && !skip_restart)
{
if constexpr (std::is_same<T, u32>::value)
{
const auto count = (remaining & ~0x3);
std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count, restart_index);
}
else if constexpr (std::is_same<T, u16>::value)
{
const auto count = (remaining & ~0xf);
std::tie(min_index, max_index, written) = upload_u16_swapped_avx2(src.data(), dst.data(), count, restart_index);
}
else
{
fmt::throw_exception("Unreachable" HERE);
}

for (const T index : src)
remaining -= written;
}
else
#endif
if (s_use_sse4_1 && remaining >= 32 && !skip_restart)
{
if constexpr (std::is_same<T, u32>::value)
{
const auto count = (remaining & ~0x3);
std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count, restart_index);
}
else if constexpr (std::is_same<T, u16>::value)
{
const auto count = (remaining & ~0x7);
std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count, restart_index);
}
else
{
fmt::throw_exception("Unreachable" HERE);
}

remaining -= written;
}
else
{
min_index = index_limit<T>();
max_index = 0;
written = 0;
}

while (remaining--)
{
T index = src[written];
if (index == restart_index)
{
if (!skip_restart)
{
dst[dst_index++] = index_limit<T>();
dst[written++] = index_limit<T>();
}
}
else
{
dst[dst_index++] = min_max(min_index, max_index, index);
dst[written++] = min_max(min_index, max_index, index);
}
}

return std::make_tuple(min_index, max_index, dst_index);
return std::make_tuple(min_index, max_index, written);
}
};

template<typename T>
std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
{
if (LIKELY(!is_primitive_restart_enabled))
if (UNLIKELY(!is_primitive_restart_enabled))
{
return untouched_impl::upload_untouched(src, dst);
}
Expand Down

0 comments on commit e255c81

Please sign in to comment.