Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixups after #9048 #9070

Merged
merged 5 commits into from
Oct 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 25 additions & 25 deletions rpcs3/Emu/Cell/PPUThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ using spu_rdata_t = decltype(ppu_thread::rdata);

extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src);
extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs);
extern u32(*const spu_getllar_tx)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime);

// Verify AVX availability for TSX transactions
static const bool s_tsx_avx = utils::has_avx();
Expand Down Expand Up @@ -1178,29 +1179,39 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)

be_t<u64> rdata;

u64 test_mask = ~vm::rsrv_shared_mask;

if (ppu.use_full_rdata)
{
if (ppu.rtime & 127)
{
continue;
// Try to use TSX to obtain data atomically
if (!g_use_rtm || !spu_getllar_tx(addr & -128, ppu.rdata, &ppu, ppu.rtime))
{
continue;
}
}
else
{
mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128));

mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128));
// Check all bit changes
test_mask = -1;
}
}
else
{
rdata = data.load();
}

if (vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime) [[likely]]
if ((vm::reservation_acquire(addr, sizeof(T)) & test_mask) == (ppu.rtime & test_mask)) [[likely]]
{
if (!ppu.use_full_rdata)
{
if (ppu.rtime & vm::rsrv_shared_mask)
{
// Let the ongoing operation some tiny time to complete
busy_wait(100);
ppu.rtime &= ~vm::rsrv_shared_mask;
}

if (data.load() != rdata)
Expand All @@ -1222,6 +1233,8 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
ppu_log.warning("%s took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count);
}

ppu.rtime &= ~vm::rsrv_shared_mask;

return static_cast<T>(rdata << data_off >> size_off);
}
}
Expand Down Expand Up @@ -1628,11 +1641,11 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
if (cmp_rdata(ppu.rdata, super_data))
{
data.release(reg_value);
res.release(rtime + 128);
res += 64;
return true;
}

res.release(rtime);
res -= 64;
return false;
}();

Expand All @@ -1648,25 +1661,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
// Aligned 8-byte reservations will be used here
addr &= -8;

if (g_use_rtm) [[likely]]
{
if (res.fetch_add(1) & vm::rsrv_unique_lock)
{
res -= 1;
return false;
}

if (data.compare_and_swap_test(old_data, reg_value))
{
res += 127;
return true;
}

res -= 1;
return false;
}

while (true)
for (u64 count = 0;; count++)
{
auto [_old, _ok] = res.fetch_op([&](u64& r)
{
Expand All @@ -1687,6 +1682,11 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)

if (_ok)
{
if (count >= 20)
{
ppu_log.notice("%s took too long (%u):", sizeof(T) == 4 ? "STWCX" : "STDCX", count);
}

break;
}

Expand Down
161 changes: 153 additions & 8 deletions rpcs3/Emu/Cell/SPUThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,14 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
c.jne(fail2);

Label tx1 = build_transaction_enter(c, fall2, x86::r12, 666);

// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
c.jc(fail3);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail2);
c.xbegin(tx1);

if (s_tsx_avx)
Expand Down Expand Up @@ -781,6 +789,127 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
c.ret();
});

// TSX (RTM) fast path for GETLLAR: atomically snapshot a 128-byte reservation
// line into `rdata` while verifying the reservation timestamp is unchanged.
// Signature: u32 spu_getllar_tx(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)
//   raddr - guest address of the reservation line (128-byte granularity)
//   rdata - destination buffer for the 128-byte copy (must be 16-byte aligned:
//           stored with movaps/vmovaps)
//   _cpu  - thread whose state flags are polled for cpu_flag::pause
//   rtime - expected reservation timestamp (low 7 bits ignored via mask)
// Returns 1 on a successful transactional read, 0 if the transaction was
// abandoned (pause requested, timestamp mismatch, or repeated aborts) —
// the caller is expected to retry or fall back to a non-TSX path.
const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;

// fall: abandon the transactional path and return 0; _ret: common epilogue.
Label fall = c.newLabel();
Label _ret = c.newLabel();

//if (utils::has_avx() && !s_tsx_avx)
//{
//	c.vzeroupper();
//}

// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
// Non-volatile GPRs rbp/r13/r12/rbx are saved because they are used below;
// 40 bytes of stack hold xmm6/xmm7 spills on Windows (used by the SSE copy path).
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
#endif

// Prepare registers
// rbp <- host pointer to the guest line (g_base_addr + raddr).
// rbx <- pointer into vm::g_reservations for this line: raddr is masked to
//        0xff80 then halved — presumably one u64 reservation slot per
//        128-byte line within a 64K lookup window (TODO confirm layout).
// r12d <- 0: abort/retry counter consumed by build_transaction_enter
//         (assumption based on the (c, fall, r12, 8) call — confirm).
// r13 <- copy of the rdata pointer (appears unused after this; stores below
//        address through args[1] directly — NOTE(review): possibly vestigial).
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
c.xor_(x86::r12d, x86::r12d);
c.mov(x86::r13, args[1]);

// Begin transaction
Label tx0 = build_transaction_enter(c, fall, x86::r12, 8);

// Check pause flag
// Bail out (without entering the transaction) if the thread was asked to pause.
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
c.jc(fall);
// Compare the current reservation value (low 7 status/lock bits masked off
// by and -128) against the caller-supplied rtime; mismatch means the line
// changed since the caller sampled it — give up before xbegin.
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.and_(x86::rax, -128);
c.cmp(x86::rax, args[3]);
c.jne(fall);
c.xbegin(tx0);

// Just read data to registers
// Inside the transaction: load all 128 bytes of the line into vector
// registers (4x ymm with AVX, 8x xmm otherwise). Any concurrent write to
// the line aborts the transaction, guaranteeing an atomic snapshot.
if (s_tsx_avx)
{
c.vmovups(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovups(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovups(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovups(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
}

c.xend();

// Store data
// Transaction committed: registers hold a consistent snapshot; write it out
// to the caller's buffer after xend (stores need not be transactional).
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(args[1], 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(args[1], 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(args[1], 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(args[1], 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(args[1], 0), x86::xmm0);
c.movaps(x86::oword_ptr(args[1], 16), x86::xmm1);
c.movaps(x86::oword_ptr(args[1], 32), x86::xmm2);
c.movaps(x86::oword_ptr(args[1], 48), x86::xmm3);
c.movaps(x86::oword_ptr(args[1], 64), x86::xmm4);
c.movaps(x86::oword_ptr(args[1], 80), x86::xmm5);
c.movaps(x86::oword_ptr(args[1], 96), x86::xmm6);
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7);
}

// Success: return 1.
c.mov(x86::eax, 1);
c.jmp(_ret);

// Failure path: return 0 (falls through into the shared epilogue).
c.bind(fall);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);

c.bind(_ret);

#ifdef _WIN32
// Restore the Windows-ABI non-volatile xmm6/xmm7 spilled in the prologue.
if (!s_tsx_avx)
{
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
#endif

// Clear upper YMM state after AVX use to avoid SSE/AVX transition penalties.
if (s_tsx_avx)
{
c.vzeroupper();
}

// Epilogue: undo the prologue in reverse order and return.
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
c.ret();
});

void spu_int_ctrl_t::set(u64 ints)
{
// leave only enabled interrupts
Expand Down Expand Up @@ -2029,11 +2158,11 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
if (cmp_rdata(rdata, super_data))
{
mov_rdata(super_data, to_write);
res.release(rtime + 128);
res += 64;
return true;
}

res.release(rtime);
res -= 64;
return false;
}();

Expand Down Expand Up @@ -2102,7 +2231,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
mov_rdata(super_data, *static_cast<const spu_rdata_t*>(to_write));
res.release(time0 + 128);
res += 64;
}

if (render) render->unpause();
Expand Down Expand Up @@ -2330,19 +2459,35 @@ bool spu_thread::process_mfc_cmd()
{
ntime = vm::reservation_acquire(addr, 128);

if (ntime & 127)
if (ntime & vm::rsrv_unique_lock)
{
// There's an on-going reservation store, wait
continue;
}

mov_rdata(rdata, data);

if (u64 time0 = vm::reservation_acquire(addr, 128);
ntime != time0)
u64 test_mask = -1;

if (ntime & 127)
{
// Try to use TSX to obtain data atomically
if (!g_use_rtm || !spu_getllar_tx(addr, rdata, this, ntime))
{
// See previous ntime check.
continue;
}
else
{
// If succeeded, only need to check unique lock bit
test_mask = ~vm::rsrv_shared_mask;
}
}

if (u64 time0 = vm::reservation_acquire(addr, 128); (ntime & test_mask) != (time0 & test_mask))
{
// Reservation data has been modified recently
if (time0 & 127) i += 12;
if (time0 & vm::rsrv_unique_lock) i += 12;
continue;
}

Expand All @@ -2352,7 +2497,7 @@ bool spu_thread::process_mfc_cmd()
continue;
}

if (i >= 25) [[unlikely]]
if (i >= 40) [[unlikely]]
{
spu_log.warning("GETLLAR took too long: %u", i);
}
Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/RSX/rsx_methods.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ namespace rsx

if (res)
{
res += 127;
res += 64;
}

vm::reservation_notifier(addr, 4).notify_all();
Expand Down