Skip to content

Commit

Permalink
PPU: minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Nekotekina committed Jan 21, 2022
1 parent 0de9960 commit bb504d9
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 10 deletions.
14 changes: 4 additions & 10 deletions rpcs3/Emu/Cell/PPUInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2839,12 +2839,9 @@ auto VSUM4SBS()

static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat)
{
//const auto r = _mm_dpbusds_epi32(b, _mm_set1_epi8(1), a);
//const auto s = _mm_dpbusd_epi32(b, _mm_set1_epi8(1), a);
auto x = gv_hadds8x4(a);
auto r = gv_adds_s32(x, b);
auto r = gv_dots_u8s8x4(gv_bcst8(1), a, b);
if constexpr (((Flags == set_sat) || ...))
sat = gv_or32(gv_xor32(gv_add32(std::move(x), std::move(b)), r), std::move(sat));
sat = gv_or32(gv_xor32(gv_add32(gv_hadds8x4(std::move(a)), std::move(b)), r), std::move(sat));
d = std::move(r);
};

Expand All @@ -2859,12 +2856,9 @@ auto VSUM4SHS()

static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat)
{
//const auto r = _mm_dpwssds_epi32(b, a, _mm_set1_epi16(1));
//const auto s = _mm_dpwssd_epi32(b, a, _mm_set1_epi16(1));
auto x = gv_hadds16x2(a);
auto r = gv_adds_s32(x, b);
auto r = gv_dots_s16x2(a, gv_bcst16(1), b);
if constexpr (((Flags == set_sat) || ...))
sat = gv_or32(gv_xor32(gv_add32(std::move(x), std::move(b)), r), std::move(sat));
sat = gv_or32(gv_xor32(gv_add32(gv_hadds16x2(std::move(a)), std::move(b)), r), std::move(sat));
d = std::move(r);
};

Expand Down
20 changes: 20 additions & 0 deletions rpcs3/util/simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2099,6 +2099,26 @@ inline v128 gv_dotu16x2(const v128& a, const v128& b)
#endif
}

// Dot product of byte quads with a 32-bit accumulator.
// For every 32-bit lane: multiply the four unsigned bytes of `a` by the four
// signed bytes of `b`, sum the four products and add the sum to the matching
// 32-bit lane of `c`.
// The VNNI and NEON paths perform that final add with signed saturation; the
// generic x64 path delegates it to gv_adds_s32 (presumably the same signed
// saturating add -- confirm against its definition).
// NOTE(review): there is no fallback branch, so on a target that defines none
// of the macros below the function has no return statement.
inline v128 gv_dots_u8s8x4(const v128& a, const v128& b, const v128& c)
{
#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__)
	// Single-instruction path: VPDPBUSDS (unsigned a, signed b, saturating accumulate into c)
	return _mm_dpbusds_epi32(c, a, b);
#elif defined(ARCH_X64)
	// Split every 16-bit lane into its even (low) and odd (high) byte.
	// `a` bytes are zero-extended, `b` bytes are sign-extended.
	const __m128i a_even = _mm_and_si128(a, _mm_set1_epi16(0x00ff));
	const __m128i a_odd = _mm_srli_epi16(a, 8);
	const __m128i b_even = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
	const __m128i b_odd = _mm_srai_epi16(b, 8);

	// madd multiplies 16-bit lanes and adds adjacent products into 32-bit lanes:
	// the even half yields byte0*byte0 + byte2*byte2, the odd half byte1*byte1 + byte3*byte3.
	const __m128i sum_even = _mm_madd_epi16(a_even, b_even);
	const __m128i sum_odd = _mm_madd_epi16(a_odd, b_odd);

	// |sum| <= 4 * 255 * 128, so the plain 32-bit add cannot overflow
	const __m128i dot = _mm_add_epi32(sum_odd, sum_even);
	return gv_adds_s32(c, dot);
#elif defined(ARCH_ARM64)
	// Widen bytes to 16 bits (zero-extend a, sign-extend b); each product fits in s16
	const int16x8_t a_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
	const int16x8_t b_lo = vmovl_s8(vget_low_s8(b));
	const int16x8_t a_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)));
	const int16x8_t b_hi = vmovl_s8(vget_high_s8(b));

	// Pairwise widening add: every 32-bit lane now holds the sum of two adjacent products
	const int32x4_t pairs_lo = vpaddlq_s16(vmulq_s16(a_lo, b_lo));
	const int32x4_t pairs_hi = vpaddlq_s16(vmulq_s16(a_hi, b_hi));

	// De-interleave so that the two halves of each quad line up, then combine them
	const int32x4_t first_halves = vuzp1q_s32(pairs_lo, pairs_hi);
	const int32x4_t second_halves = vuzp2q_s32(pairs_lo, pairs_hi);
	const int32x4_t dot = vaddq_s32(first_halves, second_halves);

	// Saturating accumulate into c
	return vqaddq_s32(c, dot);
#endif
}

// Signed s16 from a and b, 32-bit accumulator c; signed saturation
inline v128 gv_dots_s16x2(const v128& a, const v128& b, const v128& c)
{
Expand Down

0 comments on commit bb504d9

Please sign in to comment.