From 6f736543da1d863e3843edbd2b9a7f8d75ec6c86 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 30 Apr 2018 19:39:06 +0300 Subject: [PATCH] SPU ASMJIT: internal jumptable Allow indirect calls within current function using a jumptable This restores some functionality removed in SPU ASMJIT 2.0 --- rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 258 +++++++++++++++---------- rpcs3/Emu/Cell/SPUASMJITRecompiler.h | 9 + rpcs3/Emu/Cell/SPUThread.cpp | 16 ++ rpcs3/Emu/System.h | 8 + 4 files changed, 189 insertions(+), 102 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 9cb4d3121a0d..afcd0600018d 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -152,7 +152,7 @@ spu_function_t spu_recompiler::compile(const std::vector& func) vec[i] = vec_vars[i]; } - Label label_stop = c->newLabel(); + label_stop = c->newLabel(); Label label_diff = c->newLabel(); Label label_code = c->newLabel(); std::vector words; @@ -163,6 +163,15 @@ spu_function_t spu_recompiler::compile(const std::vector& func) const u32 start = m_pos; const u32 end = m_pos + (func.size() - 1) * 4; + // Create instruction labels (TODO: some of them are unnecessary) + for (u32 i = 1; i < func.size(); i++) + { + if (func[i]) + { + instr_labels[i * 4 - 4 + m_pos] = c->newLabel(); + } + } + // Set PC and check status c->mov(SPU_OFF_32(pc), m_pos); c->cmp(SPU_OFF_32(state), 0); @@ -713,6 +722,14 @@ spu_function_t spu_recompiler::compile(const std::vector& func) // Update position m_pos = pos; + // Bind instruction label if necessary + const auto found = instr_labels.find(pos); + + if (found != instr_labels.end()) + { + c->bind(found->second); + } + // Execute recompiler function (this->*s_spu_decoder.decode(op))({op}); @@ -750,6 +767,27 @@ spu_function_t spu_recompiler::compile(const std::vector& func) work(); } + // Build instruction dispatch table + if (instr_table.isValid()) + { + c->align(kAlignData, 8); + c->bind(instr_table); + + for (u32 addr = start; addr < end; addr += 4) + { + const auto found = instr_labels.find(addr); + + if (found != instr_labels.end()) + { + c->embedLabel(found->second); + } + else + { + c->embedLabel(label_stop); + } + } + } + c->align(kAlignData, words_align); c->bind(label_code); for (u32 d : words) @@ -760,6 +798,9 @@ spu_function_t spu_recompiler::compile(const std::vector& func) work(); } + label_stop.reset(); + instr_table.reset(); + instr_labels.clear(); xmm_consts.clear(); // Compile and get function address @@ -1066,6 +1107,28 @@ void spu_recompiler::branch_fixed(u32 target) { using namespace asmjit; + // Check local branch + const auto local = instr_labels.find(target); + + if (local != instr_labels.end() && local->second.isValid()) + { + c->cmp(SPU_OFF_32(state), 0); + c->jz(local->second); + c->mov(SPU_OFF_32(pc), target); + c->ret(); + return; + } + + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + // Don't generate patch points in this mode + c->mov(x86::rax, x86::qword_ptr(*cpu, offset32(&SPUThread::jit_dispatcher) + target * 2)); + c->mov(SPU_OFF_32(pc), target); + c->xor_(qw0->r32(), qw0->r32()); + c->jmp(x86::rax); + return; + } + // Set patch address as a third argument and fallback to it Label patch_point = c->newLabel(); c->lea(*qw0, x86::qword_ptr(patch_point)); @@ -1109,9 +1172,23 @@ void spu_recompiler::branch_indirect(spu_opcode_t op) { using namespace asmjit; - // Load indirect jump address - c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); + if (!instr_table.isValid()) + { + // Request instruction table + instr_table = c->newLabel(); + } + + const u32 start = instr_labels.begin()->first; + const u32 end = instr_labels.rbegin()->first + 4; + + // Load indirect jump address, choose between local and external + c->lea(x86::r10, x86::qword_ptr(instr_table)); + c->lea(*qw1, x86::qword_ptr(*addr, 0 - start)); c->xor_(qw0->r32(), qw0->r32()); + c->cmp(qw1->r32(), end - start); + c->cmovae(qw1->r32(), qw0->r32()); + c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0)); + c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); if (op.d) { @@ -1119,22 +1196,34 @@ void spu_recompiler::branch_indirect(spu_opcode_t op) } else if (op.e) { - c->lock().bts(SPU_OFF_8(interrupts_enabled), 0); - c->mov(x86::r9d, SPU_OFF_32(ch_event_stat)); - c->and_(x86::r9d, SPU_OFF_32(ch_event_mask)); - c->and_(x86::r9d, SPU_EVENT_INTR_TEST); - c->cmp(x86::r9d, 0); + Label no_intr = c->newLabel(); + Label intr = c->newLabel(); + Label fail = c->newLabel(); - Label noInterrupt = c->newLabel(); - c->je(noInterrupt); + c->lock().bts(SPU_OFF_8(interrupts_enabled), 0); + c->mov(qw1->r32(), SPU_OFF_32(ch_event_mask)); + c->test(qw1->r32(), ~SPU_EVENT_INTR_IMPLEMENTED); + c->jnz(fail); + c->and_(qw1->r32(), SPU_OFF_32(ch_event_stat)); + c->test(qw1->r32(), SPU_EVENT_INTR_IMPLEMENTED); + c->jnz(intr); + c->jmp(no_intr); + c->bind(fail); + c->mov(SPU_OFF_32(pc), *addr); + c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); + c->mov(asmjit::x86::dword_ptr(addr->r64()), "INTR"_u32); + c->bind(intr); c->lock().btr(SPU_OFF_8(interrupts_enabled), 0); c->mov(SPU_OFF_32(srr0), *addr); - branch_fixed(0); + c->mov(*addr, qw0->r32()); + c->mov(x86::r10, x86::qword_ptr(*cpu, offset32(&SPUThread::jit_dispatcher))); c->align(kAlignCode, 16); - c->bind(noInterrupt); + c->bind(no_intr); } c->mov(SPU_OFF_32(pc), *addr); + c->cmp(SPU_OFF_32(state), 0); + c->jnz(label_stop); c->jmp(x86::r10); } @@ -1348,25 +1437,37 @@ void spu_recompiler::MFSPR(spu_opcode_t op) c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } +static void ret_helper(SPUThread& spu, void*, u8*) +{ + // MSVC workaround (TCO) +} + void spu_recompiler::RDCH(spu_opcode_t op) { using namespace asmjit; - auto gate = [](SPUThread* _spu, u32 ch, v128* out) + auto gate = [](SPUThread* _spu, u32 ch, v128* out, spu_function_t _ret) { - u32 value; + // Workaround (TCO) + static thread_local u32 value; if (_spu->get_ch_value(ch, value)) { *out = v128::from32r(value); - _spu->pc += 4; } + else + { + _ret = &ret_helper; + } + + _ret(*_spu, _spu->_ptr(0), nullptr); }; auto read_channel = [&](X86Mem channel_ptr, bool sync = true) { Label wait = c->newLabel(); Label again = c->newLabel(); + Label ret = c->newLabel(); c->mov(addr->r64(), channel_ptr); c->xor_(qw0->r32(), qw0->r32()); c->align(kAlignCode, 16); @@ -1376,12 +1477,12 @@ void spu_recompiler::RDCH(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { - // Do not continue after waiting c->bind(wait); c->mov(SPU_OFF_32(pc), pos); c->mov(*ls, op.ra); c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); - c->jmp(imm_ptr(gate)); + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); }); if (sync) @@ -1400,6 +1501,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->movd(vr, *addr); c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->bind(ret); }; switch (op.ra) @@ -1415,42 +1517,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) case SPU_RdInMbox: { // TODO - Label wait = c->newLabel(); - Label next = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->cmp(x86::byte_ptr(*cpu, offset32(&SPUThread::ch_in_mbox) + 1), 0); - c->jz(wait); - - after.emplace_back([=] - { - // Do not continue after waiting - c->bind(wait); - c->mov(*ls, op.ra); - c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); - c->jmp(imm_ptr(gate)); - }); - - auto sub = [](SPUThread* _spu, v128* out, spu_function_t _ret) - { - // Workaround for gcc (TCO) - static thread_local u32 value; - - if (!_spu->get_ch_value(SPU_RdInMbox, value)) - { - // Workaround for MSVC (TCO) - fmt::raw_error("spu_recompiler::RDCH(): unexpected SPUThread::get_ch_value(SPU_RdInMbox) call"); - } - - *out = v128::from32r(value); - _ret(*_spu, _spu->_ptr(0), nullptr); - }; - - c->lea(*ls, SPU_OFF_128(gpr, op.rt)); - c->lea(*qw0, x86::qword_ptr(next)); - c->jmp(imm_ptr(sub)); - c->align(kAlignCode, 16); - c->bind(next); - return; + break; } case MFC_RdTagStat: { @@ -1489,7 +1556,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) { LOG_WARNING(SPU, "[0x%x] RDCH: RdDec", m_pos); - auto gate1 = [](SPUThread* _spu, v128* _res, spu_function_t _ret) + auto sub1 = [](SPUThread* _spu, v128* _res, spu_function_t _ret) { const u32 out = _spu->ch_dec_value - static_cast(get_timebased_time() - _spu->ch_dec_start_timestamp); @@ -1500,7 +1567,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) _ret(*_spu, _spu->_ptr(0), nullptr); }; - auto gate2 = [](SPUThread* _spu, v128* _res, spu_function_t _ret) + auto sub2 = [](SPUThread* _spu, v128* _res, spu_function_t _ret) { const u32 out = _spu->ch_dec_value - static_cast(get_timebased_time() - _spu->ch_dec_start_timestamp); @@ -1514,7 +1581,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->mov(SPU_OFF_32(pc), m_pos); c->lea(*ls, SPU_OFF_128(gpr, op.rt)); c->lea(*qw0, asmjit::x86::qword_ptr(next)); - c->jmp(g_cfg.core.spu_loop_detection ? asmjit::imm_ptr(gate1) : asmjit::imm_ptr(gate2)); + c->jmp(g_cfg.core.spu_loop_detection ? asmjit::imm_ptr(sub1) : asmjit::imm_ptr(sub2)); c->align(asmjit::kAlignCode, 16); c->bind(next); return; @@ -1532,22 +1599,24 @@ void spu_recompiler::RDCH(spu_opcode_t op) LOG_WARNING(SPU, "[0x%x] RDCH: RdEventStat", m_pos); get_events(); Label wait = c->newLabel(); + Label ret = c->newLabel(); c->jz(wait); after.emplace_back([=, pos = m_pos] { - // Do not continue after waiting c->bind(wait); c->mov(SPU_OFF_32(pc), pos); c->mov(*ls, op.ra); c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); - c->jmp(imm_ptr(gate)); + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); }); const XmmLink& vr = XmmAlloc(); c->movd(vr, *addr); c->pslldq(vr, 12); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->bind(ret); return; } case SPU_RdMachStat: @@ -1561,11 +1630,13 @@ void spu_recompiler::RDCH(spu_opcode_t op) } } + Label ret = c->newLabel(); c->mov(SPU_OFF_32(pc), m_pos); c->mov(*ls, op.ra); c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); - c->jmp(imm_ptr(gate)); - m_pos = -1; + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); + c->bind(ret); } void spu_recompiler::RCHCNT(spu_opcode_t op) @@ -2314,12 +2385,14 @@ void spu_recompiler::WRCH(spu_opcode_t op) { using namespace asmjit; - auto gate = [](SPUThread* _spu, u32 ch, u32 value) + auto gate = [](SPUThread* _spu, u32 ch, u32 value, spu_function_t _ret) { - if (_spu->set_ch_value(ch, value)) + if (!_spu->set_ch_value(ch, value)) { - _spu->pc += 4; + _ret = &ret_helper; } + + _ret(*_spu, _spu->_ptr(0), nullptr); }; switch (op.ra) @@ -2332,42 +2405,14 @@ void spu_recompiler::WRCH(spu_opcode_t op) } case SPU_WrOutIntrMbox: { - auto sub = [](SPUThread* _spu, spu_function_t _ret, u32 value) - { - if (!_spu->set_ch_value(SPU_WrOutIntrMbox, value)) - { - fmt::raw_error("spu_recompiler::WRCH(): unexpected SPUThread::set_ch_value(SPU_WrOutIntrMbox) call"); - } - - // Continue - _ret(*_spu, _spu->_ptr(0), nullptr); - }; - - Label ret = c->newLabel(); - Label wait = c->newLabel(); - c->mov(SPU_OFF_32(pc), m_pos); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->bt(SPU_OFF_64(ch_out_intr_mbox), spu_channel::off_count); - c->jc(wait); - - after.emplace_back([=] - { - // Do not continue after waiting - c->bind(wait); - c->mov(*ls, op.ra); - c->jmp(imm_ptr(gate)); - }); - - c->lea(*ls, x86::qword_ptr(ret)); - c->jmp(imm_ptr(sub)); - c->align(kAlignCode, 16); - c->bind(ret); - return; + // Can't seemingly be optimized + break; } case SPU_WrOutMbox: { Label wait = c->newLabel(); Label again = c->newLabel(); + Label ret = c->newLabel(); c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); c->mov(addr->r64(), SPU_OFF_64(ch_out_mbox)); c->align(kAlignCode, 16); @@ -2378,16 +2423,17 @@ void spu_recompiler::WRCH(spu_opcode_t op) after.emplace_back([=, pos = m_pos] { - // Do not continue after waiting c->bind(wait); c->mov(SPU_OFF_32(pc), pos); c->mov(*ls, op.ra); - c->jmp(imm_ptr(gate)); + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); }); c->bts(*qw0, spu_channel::off_count); c->lock().cmpxchg(SPU_OFF_64(ch_out_mbox), *qw0); c->jnz(again); + c->bind(ret); return; } case MFC_WrTagMask: @@ -2435,7 +2481,8 @@ void spu_recompiler::WRCH(spu_opcode_t op) c->bind(fail); c->mov(SPU_OFF_32(pc), pos); c->mov(*ls, op.ra); - c->jmp(imm_ptr(gate)); + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); c->bind(zero); c->mov(SPU_OFF_32(ch_tag_upd), qw0->r32()); @@ -2500,7 +2547,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) { if (!_spu->process_mfc_cmd(_spu->ch_mfc_cmd)) { - throw cpu_flag::ret; + _ret = &ret_helper; } _ret(*_spu, _spu->_ptr(0), nullptr); @@ -2555,6 +2602,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) case SPU_WrEventMask: { Label fail = c->newLabel(); + Label ret = c->newLabel(); c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); c->mov(*addr, ~SPU_EVENT_IMPLEMENTED); c->mov(qw1->r32(), ~SPU_EVENT_INTR_IMPLEMENTED); @@ -2568,15 +2616,18 @@ void spu_recompiler::WRCH(spu_opcode_t op) c->bind(fail); c->mov(SPU_OFF_32(pc), pos); c->mov(*ls, op.ra); - c->jmp(imm_ptr(gate)); + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); }); c->mov(SPU_OFF_32(ch_event_mask), qw0->r32()); + c->bind(ret); return; } case SPU_WrEventAck: { Label fail = c->newLabel(); + Label ret = c->newLabel(); c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); c->test(qw0->r32(), ~SPU_EVENT_IMPLEMENTED); c->jnz(fail); @@ -2586,7 +2637,8 @@ void spu_recompiler::WRCH(spu_opcode_t op) c->bind(fail); c->mov(SPU_OFF_32(pc), pos); c->mov(*ls, op.ra); - c->jmp(imm_ptr(gate)); + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); }); c->not_(qw0->r32()); @@ -2599,11 +2651,13 @@ void spu_recompiler::WRCH(spu_opcode_t op) } } + Label ret = c->newLabel(); c->mov(SPU_OFF_32(pc), m_pos); c->mov(*ls, op.ra); c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->jmp(imm_ptr(gate)); - m_pos = -1; + c->lea(*qw1, x86::qword_ptr(ret)); + c->jmp(imm_ptr(gate)); + c->bind(ret); } void spu_recompiler::BIZ(spu_opcode_t op) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index eb3de416fa40..84fecb4a8a8e 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -59,6 +59,15 @@ class spu_recompiler : public spu_recompiler_base std::vector> after; std::vector> consts; + // Function return label + asmjit::Label label_stop; + + // Indirect branch dispatch table + asmjit::Label instr_table; + + // All valid instruction labels + std::map instr_labels; + // All emitted 128-bit consts std::map, asmjit::Label> xmm_consts; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 2755f864fdf6..4311f1763805 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -119,6 +119,22 @@ void fmt_class_string::format(std::string& out, u64 arg) }); } +template <> +void fmt_class_string::format(std::string& out, u64 arg) +{ + format_enum(out, arg, [](spu_block_size_type type) + { + switch (type) + { + case spu_block_size_type::safe: return "Safe"; + case spu_block_size_type::mega: return "Mega"; + case spu_block_size_type::giga: return "Giga"; + } + + return unknown; + }); +} + namespace spu { namespace scheduler diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 9c046470f799..96f6002e21d8 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -30,6 +30,13 @@ enum class spu_decoder_type llvm, }; +enum class spu_block_size_type +{ + safe, + mega, + giga, +}; + enum class lib_loading_type { automatic, @@ -303,6 +310,7 @@ struct cfg_root : cfg::node cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads + cfg::_enum spu_block_size{this, "SPU Block Size"}; cfg::_enum lib_loading{this, "Lib Loader", lib_loading_type::liblv2only}; cfg::_bool hook_functions{this, "Hook static functions"};