Skip to content

Commit

Permalink
rsx: Remove redefinition of SSE4.1 instructions
Browse files Browse the repository at this point in the history
Now that clang is aware that our functions are compiled with SSE4.1, it
lets us generate this code using its intrinsics.
  • Loading branch information
linkmauve committed Oct 28, 2019
1 parent 47fa651 commit fb33c18
Showing 1 changed file with 52 additions and 81 deletions.
133 changes: 52 additions & 81 deletions rpcs3/Emu/RSX/Common/BufferUtils.cpp
Expand Up @@ -10,7 +10,6 @@

#if defined(_MSC_VER)
#define __SSSE3__ 1
#define __SSE4_1__ 1
#define SSE4_1_FUNC
#else
#define __sse_intrin static FORCE_INLINE
Expand All @@ -28,34 +27,6 @@ __sse_intrin __m128i __mm_shuffle_epi8(__m128i opd, __m128i opa)
#define __mm_shuffle_epi8 _mm_shuffle_epi8
#endif // __SSSE3__

#ifndef __SSE4_1__
// Fallback used when __SSE4_1__ is not defined: stands in for the
// _mm_max_epu32 intrinsic by emitting the PMAXUD instruction through
// inline assembly.
// opd - first operand; also receives the result ("+x": read-write, SSE register).
// opa - second operand ("xm": SSE register or memory, read-only).
// Returns opd after the instruction has been applied to it.
__sse_intrin __m128i __mm_max_epu32(__m128i opd, __m128i opa)
{
// AT&T operand order: %1 (opa) is the source, %0 (opd) is the destination.
__asm__("pmaxud %1, %0" : "+x" (opd) : "xm" (opa));
return opd;
}
// Fallback used when __SSE4_1__ is not defined: stands in for the
// _mm_min_epu32 intrinsic by emitting the PMINUD instruction through
// inline assembly.
// opd - first operand; also receives the result ("+x": read-write, SSE register).
// opa - second operand ("xm": SSE register or memory, read-only).
// Returns opd after the instruction has been applied to it.
__sse_intrin __m128i __mm_min_epu32(__m128i opd, __m128i opa)
{
// AT&T operand order: %1 (opa) is the source, %0 (opd) is the destination.
__asm__("pminud %1, %0" : "+x" (opd) : "xm" (opa));
return opd;
}
// Fallback used when __SSE4_1__ is not defined: stands in for the
// _mm_max_epu16 intrinsic by emitting the PMAXUW instruction through
// inline assembly.
// opd - first operand; also receives the result ("+x": read-write, SSE register).
// opa - second operand ("xm": SSE register or memory, read-only).
// Returns opd after the instruction has been applied to it.
__sse_intrin __m128i __mm_max_epu16(__m128i opd, __m128i opa)
{
// AT&T operand order: %1 (opa) is the source, %0 (opd) is the destination.
__asm__("pmaxuw %1, %0" : "+x" (opd) : "xm" (opa));
return opd;
}
// Fallback used when __SSE4_1__ is not defined: stands in for the
// _mm_min_epu16 intrinsic by emitting the PMINUW instruction through
// inline assembly.
// opd - first operand; also receives the result ("+x": read-write, SSE register).
// opa - second operand ("xm": SSE register or memory, read-only).
// Returns opd after the instruction has been applied to it.
__sse_intrin __m128i __mm_min_epu16(__m128i opd, __m128i opa)
{
// AT&T operand order: %1 (opa) is the source, %0 (opd) is the destination.
__asm__("pminuw %1, %0" : "+x" (opd) : "xm" (opa));
return opd;
}
#else
#define __mm_max_epu32 _mm_max_epu32
#define __mm_min_epu32 _mm_min_epu32
#define __mm_max_epu16 _mm_max_epu16
#define __mm_min_epu16 _mm_min_epu16
#endif // __SSE4_1__

#undef __sse_intrin

const bool s_use_ssse3 = utils::has_ssse3();
Expand Down Expand Up @@ -624,9 +595,9 @@ namespace
for (unsigned n = 0; n < iterations; ++n)
{
const __m128i raw = _mm_loadu_si128(src_stream++);
const __m128i value = __mm_shuffle_epi8(raw, mask);
max = __mm_max_epu16(max, value);
min = __mm_min_epu16(min, value);
const __m128i value = _mm_shuffle_epi8(raw, mask);
max = _mm_max_epu16(max, value);
min = _mm_min_epu16(min, value);
_mm_storeu_si128(dst_stream++, value);
}

Expand All @@ -642,19 +613,19 @@ namespace
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0x3, 0x2);

__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
min = __mm_min_epu16(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step2);
min = __mm_min_epu16(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step3);
min = __mm_min_epu16(min, tmp);
__m128i tmp = _mm_shuffle_epi8(min, mask_step1);
min = _mm_min_epu16(min, tmp);
tmp = _mm_shuffle_epi8(min, mask_step2);
min = _mm_min_epu16(min, tmp);
tmp = _mm_shuffle_epi8(min, mask_step3);
min = _mm_min_epu16(min, tmp);

tmp = __mm_shuffle_epi8(max, mask_step1);
max = __mm_max_epu16(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step2);
max = __mm_max_epu16(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step3);
max = __mm_max_epu16(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step1);
max = _mm_max_epu16(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step2);
max = _mm_max_epu16(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step3);
max = _mm_max_epu16(max, tmp);

const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
Expand Down Expand Up @@ -682,9 +653,9 @@ namespace
for (unsigned n = 0; n < iterations; ++n)
{
const __m128i raw = _mm_loadu_si128(src_stream++);
const __m128i value = __mm_shuffle_epi8(raw, mask);
max = __mm_max_epu32(max, value);
min = __mm_min_epu32(min, value);
const __m128i value = _mm_shuffle_epi8(raw, mask);
max = _mm_max_epu32(max, value);
min = _mm_min_epu32(min, value);
_mm_storeu_si128(dst_stream++, value);
}

Expand All @@ -697,15 +668,15 @@ namespace
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);

__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
min = __mm_min_epu32(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step2);
min = __mm_min_epu32(min, tmp);
__m128i tmp = _mm_shuffle_epi8(min, mask_step1);
min = _mm_min_epu32(min, tmp);
tmp = _mm_shuffle_epi8(min, mask_step2);
min = _mm_min_epu32(min, tmp);

tmp = __mm_shuffle_epi8(max, mask_step1);
max = __mm_max_epu32(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step2);
max = __mm_max_epu32(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step1);
max = _mm_max_epu32(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step2);
max = _mm_max_epu32(max, tmp);

const u32 min_index = u32(_mm_cvtsi128_si32(min));
const u32 max_index = u32(_mm_cvtsi128_si32(max));
Expand Down Expand Up @@ -782,12 +753,12 @@ namespace
for (unsigned n = 0; n < iterations; ++n)
{
const __m128i raw = _mm_loadu_si128(src_stream++);
const __m128i value = __mm_shuffle_epi8(raw, shuffle_mask);
const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
const __m128i mask = _mm_cmpeq_epi16(restart, value);
const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
const __m128i value_with_max_restart = _mm_or_si128(mask, value);
max = __mm_max_epu16(max, value_with_min_restart);
min = __mm_min_epu16(min, value_with_max_restart);
max = _mm_max_epu16(max, value_with_min_restart);
min = _mm_min_epu16(min, value_with_max_restart);
_mm_storeu_si128(dst_stream++, value);
}

Expand All @@ -803,19 +774,19 @@ namespace
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0x3, 0x2);

__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
min = __mm_min_epu16(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step2);
min = __mm_min_epu16(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step3);
min = __mm_min_epu16(min, tmp);
__m128i tmp = _mm_shuffle_epi8(min, mask_step1);
min = _mm_min_epu16(min, tmp);
tmp = _mm_shuffle_epi8(min, mask_step2);
min = _mm_min_epu16(min, tmp);
tmp = _mm_shuffle_epi8(min, mask_step3);
min = _mm_min_epu16(min, tmp);

tmp = __mm_shuffle_epi8(max, mask_step1);
max = __mm_max_epu16(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step2);
max = __mm_max_epu16(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step3);
max = __mm_max_epu16(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step1);
max = _mm_max_epu16(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step2);
max = _mm_max_epu16(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step3);
max = _mm_max_epu16(max, tmp);

const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
Expand Down Expand Up @@ -846,12 +817,12 @@ namespace
for (unsigned n = 0; n < iterations; ++n)
{
const __m128i raw = _mm_loadu_si128(src_stream++);
const __m128i value = __mm_shuffle_epi8(raw, shuffle_mask);
const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
const __m128i mask = _mm_cmpeq_epi32(restart, value);
const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
const __m128i value_with_max_restart = _mm_or_si128(mask, value);
max = __mm_max_epu32(max, value_with_min_restart);
min = __mm_min_epu32(min, value_with_max_restart);
max = _mm_max_epu32(max, value_with_min_restart);
min = _mm_min_epu32(min, value_with_max_restart);
_mm_storeu_si128(dst_stream++, value);
}

Expand All @@ -863,15 +834,15 @@ namespace
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);

__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
min = __mm_min_epu32(min, tmp);
tmp = __mm_shuffle_epi8(min, mask_step2);
min = __mm_min_epu32(min, tmp);
__m128i tmp = _mm_shuffle_epi8(min, mask_step1);
min = _mm_min_epu32(min, tmp);
tmp = _mm_shuffle_epi8(min, mask_step2);
min = _mm_min_epu32(min, tmp);

tmp = __mm_shuffle_epi8(max, mask_step1);
max = __mm_max_epu32(max, tmp);
tmp = __mm_shuffle_epi8(max, mask_step2);
max = __mm_max_epu32(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step1);
max = _mm_max_epu32(max, tmp);
tmp = _mm_shuffle_epi8(max, mask_step2);
max = _mm_max_epu32(max, tmp);

const u32 min_index = u32(_mm_cvtsi128_si32(min));
const u32 max_index = u32(_mm_cvtsi128_si32(max));
Expand Down

0 comments on commit fb33c18

Please sign in to comment.