From e3ca4dd5049bec9c673921d95ef26aa7db91750b Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Sun, 5 May 2019 16:28:41 +0300
Subject: [PATCH] SPU: basic function analysis implemented

Basic stack frame layout analysis.
Function detection in Giga mode.
Basic use of new information in SPU LLVM.
This is WIP and may not work correctly.
Optimizations include, but are not limited to:
* Compiling SPU functions as native functions when eligible
* Avoiding register context write-out
---
 rpcs3/Emu/Cell/SPUAnalyser.h     |   27 +-
 rpcs3/Emu/Cell/SPURecompiler.cpp | 1105 +++++++++++++++++++++++++-----
 rpcs3/Emu/Cell/SPURecompiler.h   |   60 +-
 3 files changed, 1001 insertions(+), 191 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPUAnalyser.h b/rpcs3/Emu/Cell/SPUAnalyser.h
index adaa4ebc6489..65ac1d5d9710 100644
--- a/rpcs3/Emu/Cell/SPUAnalyser.h
+++ b/rpcs3/Emu/Cell/SPUAnalyser.h
@@ -11,6 +11,7 @@ struct spu_itype
 	static constexpr struct branch_tag{} branch{}; // Branch Instructions
 	static constexpr struct floating_tag{} floating{}; // Floating-Point Instructions
 	static constexpr struct quadrop_tag{} _quadrop{}; // 4-op Instructions
+	static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values
 
 	enum type : unsigned char
 	{
@@ -146,24 +147,26 @@ struct spu_itype
 		FMS, // quadrop_tag last
 
 		FA,
-		DFA,
 		FS,
-		DFS,
 		FM,
+		FREST,
+		FRSQEST,
+		FI,
+		CSFLT,
+		CUFLT,
+		FRDS, // xfloat_tag last
+
+		DFA,
+		DFS,
 		DFM,
 		DFMA,
 		DFNMS,
 		DFMS,
 		DFNMA,
-		FREST,
-		FRSQEST,
-		FI,
-		CSFLT,
+		FESD,
 		CFLTS,
-		CUFLT,
 		CFLTU,
-		FRDS,
-		FESD,
 		FCEQ,
 		FCMEQ,
 		FCGT,
@@ -252,6 +255,12 @@ struct spu_itype
 	{
 		return value >= MPYA && value <= FMS;
 	}
+
+	// Test for xfloat instruction
+	friend constexpr bool operator &(type value, xfloat_tag)
+	{
+		return value >= FMA && value <= FRDS;
+	}
 };
 
 struct spu_iflag
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 54ef3a8cd2b4..5ec798081774 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -915,6 +915,8 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 	m_preds.clear();
 	m_preds[entry_point];
 	m_bbs.clear();
+	m_chunks.clear();
+	m_funcs.clear();
 
 	// Value flags (TODO)
 	enum class vf : u32
@@ -1885,13 +1887,36 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 		{
 			block.size++;
 
+			// Decode instruction
+			const spu_opcode_t op{se_storage<u32>::swap(result[(ia - lsa) / 4 + 1])};
+
+			const auto type = s_spu_itype.decode(op.opcode);
+
+			u8 reg_save = 255;
+
+			if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt])
+			{
+				// Register saved onto the stack before use
+				block.reg_save_dom[op.rt] = true;
+
+				reg_save = op.rt;
+			}
+
 			for (auto* _use : {&m_use_ra, &m_use_rb, &m_use_rc})
 			{
 				if (u8 reg = (*_use)[ia / 4]; reg < s_reg_max)
 				{
 					// Register reg use only if it happens before reg mod
 					if (!block.reg_mod[reg])
+					{
 						block.reg_use.set(reg);
+
+						if (reg_save != reg && block.reg_save_dom[reg])
+						{
+							// Register is still used after saving; probably not eligible for optimization
+							block.reg_save_dom[reg] = false;
+						}
+					}
 				}
 			}
 
@@ -1909,6 +1934,16 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 			if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max)
 			{
 				block.reg_mod.set(reg);
+				block.reg_mod_xf.set(reg, type & spu_itype::xfloat);
+
+				if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb]))
+					block.reg_mod_xf.set(reg);
+
+				// Possible post-dominating register load
+				if (type == spu_itype::LQD && op.ra == s_reg_sp)
+					
block.reg_load_mod[reg] = ia + 1;
+				else
+					block.reg_load_mod[reg] = 0;
 			}
 
 			// Find targets (also means end of the block)
@@ -1918,6 +1953,25 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 				{
 					// Copy targets
 					block.targets = tfound->second;
+
+					// Assume that the call modifies all volatile registers
+					if (type == spu_itype::BRSL || type == spu_itype::BRASL)
+					{
+						const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : ia, op.i16);
+
+						if (target != ia + 4)
+						{
+							for (u32 i = 0; i < s_reg_max; ++i)
+							{
+								if (i == s_reg_lr || (i >= s_reg_80 && i <= s_reg_127))
+								{
+									block.reg_mod.set(i);
+									block.reg_mod_xf[i] = false;
+								}
+							}
+						}
+					}
+
 					break;
 				}
 			}
@@ -1926,10 +1980,91 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 	// Fix up block predecessors to point to basic blocks, not last instructions
 	for (auto& bb : m_bbs)
 	{
+		const u32 addr = bb.first;
+
 		for (u32& pred : bb.second.preds)
 		{
 			pred = std::prev(m_bbs.upper_bound(pred))->first;
 		}
+
+		if (m_entry_info[addr / 4])
+		{
+			// Register empty chunk
+			m_chunks.push_back(addr);
+
+			// Register function if necessary
+			if (!m_ret_info[addr / 4])
+			{
+				m_funcs[addr];
+			}
+		}
 	}
+
+	// Ensure there is a function at the lowest address
+	if (auto emp = m_funcs.try_emplace(m_bbs.begin()->first); emp.second)
+	{
+		const u32 addr = emp.first->first;
+		LOG_ERROR(SPU, "Fixed first function at 0x%05x", addr);
+		m_entry_info[addr / 4] = true;
+		m_ret_info[addr / 4] = false;
+	}
+
+	// Split functions
+	while (true)
+	{
+		bool need_repeat = false;
+
+		u32 start = 0;
+		u32 limit = 0x40000;
+
+		// Walk block list in ascending order
+		for (auto& block : m_bbs)
+		{
+			const u32 addr = block.first;
+
+			if (m_entry_info[addr / 4] && !m_ret_info[addr / 4])
+			{
+				const auto upper = m_funcs.upper_bound(addr);
+				start = addr;
+				limit = upper == m_funcs.end() ? 
0x40000 : upper->first; + } + + // Find targets that exceed [start; limit) range and make new functions from them + for (u32 target : block.second.targets) + { + const auto tfound = m_bbs.find(target); + + if (tfound == m_bbs.end()) + { + continue; + } + + if (target < start || target >= limit) + { + if (!m_entry_info[target / 4] || m_ret_info[target / 4]) + { + // Create new function entry (likely a tail call) + m_entry_info[target / 4] = true; + + m_ret_info[target / 4] = false; + + m_funcs.try_emplace(target); + + if (target < limit) + { + need_repeat = true; + } + } + } + } + + block.second.func = start; + } + + if (!need_repeat) + { + break; + } } // Fill entry map, add chunk addresses @@ -1951,7 +2086,7 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en // Check block predecessors for (u32 pred : block.preds) { - const u32 _old = m_bbs[pred].chunk; + const u32 _old = m_bbs.at(pred).chunk; if (_old < 0x40000 && _old != _new) { @@ -2040,6 +2175,12 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en workload.push_back(target); tb.analysed = true; } + + // Limited xfloat hint propagation (possibly TODO) + if (tb.chunk == block.chunk) + { + tb.reg_maybe_xf |= block.reg_mod_xf; + } } block.reg_origin.fill(0x80000000); @@ -2103,64 +2244,563 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en // Set -1 if multiple origins merged (requires PHI node) tb.reg_origin[i] = -1; - must_repeat |= !tb.targets.empty(); + must_repeat |= !tb.targets.empty(); + } + } + + if (tb.chunk != block.chunk && !(m_entry_info[target / 4] && m_ret_info[target / 4])) + { + // Skip call targets completely + continue; + } + + if (tb.reg_origin_abs[i] != -2) + { + const u32 expected = block.reg_mod[i] ? addr : block.reg_origin_abs[i]; + + if (tb.reg_origin_abs[i] == 0x80000000) + { + tb.reg_origin_abs[i] = expected; + } + else if (tb.reg_origin_abs[i] != expected) + { + if (tb.reg_origin_abs[i] == 0x40000 || expected == -2 || expected == 0x40000) + { + // Set -2: sticky value indicating possible external reg origin (0x40000) + tb.reg_origin_abs[i] = -2; + + must_repeat |= !tb.targets.empty(); + } + else if (tb.reg_origin_abs[i] != -1) + { + tb.reg_origin_abs[i] = -1; + + must_repeat |= !tb.targets.empty(); + } + } + } + } + } + } + + if (!must_repeat) + { + break; + } + + for (u32 wi = 0; wi < workload.size(); wi++) + { + const u32 addr = workload[wi]; + auto& block = m_bbs.at(addr); + + // Reset values for the next attempt (keep negative values) + for (u32 i = 0; i < s_reg_max; i++) + { + if (block.reg_origin[i] <= 0x40000) + block.reg_origin[i] = 0x80000000; + if (block.reg_origin_abs[i] <= 0x40000) + block.reg_origin_abs[i] = 0x80000000; + } + } + } + + // Fill more block info + for (u32 wi = 0; wi < workload.size(); wi++) + { + const u32 addr = workload[wi]; + auto& bb = m_bbs.at(addr); + auto& func = m_funcs.at(bb.func); + + // Update function size + func.size = std::max(func.size, bb.size + (addr - bb.func) / 4); + + // Copy constants according to reg_origin info + for (u32 i = 0; i < s_reg_max; i++) + { + // Select reg origin (this isn't completely safe) + const u32 orig = bb.reg_origin_abs[i]; + + if (orig < 0x40000) + { + auto& src = m_bbs.at(orig); + bb.reg_const[i] = src.reg_const[i]; + bb.reg_val32[i] = src.reg_val32[i]; + } + + if (!bb.reg_save_dom[i] && bb.reg_use[i] && (orig == 0x40000 || orig == -2)) + { + // Destroy offset if external reg value is used + func.reg_save_off[i] = -1; + } + } + + if (u32 orig = bb.reg_origin_abs[s_reg_sp]; 
orig < 0x40000) + { + auto& prologue = m_bbs.at(orig); + + // Copy stack offset (from the assumed prologue) + bb.stack_sub = prologue.stack_sub; + } + else if (orig > 0x40000) + { + // Unpredictable stack + bb.stack_sub = 0x80000000; + } + + spu_opcode_t op; + + auto last_inst = spu_itype::UNK; + + for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4) + { + // Decode instruction again + op.opcode = se_storage::swap(result[(ia - lsa) / 4 + 1]); + last_inst = s_spu_itype.decode(op.opcode); + + // Propagate some constants + switch (last_inst) + { + case spu_itype::IL: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.si16; + break; + } + case spu_itype::ILA: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.i18; + break; + } + case spu_itype::ILHU: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.i16 << 16; + break; + } + case spu_itype::ILH: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.i16 << 16 | op.i16; + break; + } + case spu_itype::IOHL: + { + bb.reg_val32[op.rt] = bb.reg_val32[op.rt] | op.i16; + break; + } + case spu_itype::ORI: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | op.si10; + break; + } + case spu_itype::OR: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | bb.reg_val32[op.rb]; + break; + } + case spu_itype::AI: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + op.si10; + break; + } + case spu_itype::A: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + bb.reg_val32[op.rb]; + break; + } + case spu_itype::SFI: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra]; + bb.reg_val32[op.rt] = op.si10 - bb.reg_val32[op.ra]; + break; + } + case spu_itype::SF: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb]; + bb.reg_val32[op.rt] = bb.reg_val32[op.rb] - bb.reg_val32[op.ra]; + break; + } + case spu_itype::STQD: + { + if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_save_dom[op.rt]) + { + const u32 offset = 0x80000000 + op.si10 * 16 - bb.stack_sub; + + if (func.reg_save_off[op.rt] == 0) + { + // Store reg save offset + func.reg_save_off[op.rt] = offset; + } + else if (func.reg_save_off[op.rt] != offset) + { + // Conflict of different offsets + func.reg_save_off[op.rt] = -1; + } + } + + break; + } + case spu_itype::LQD: + { + if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_load_mod[op.rt] == ia + 1) + { + // Adjust reg load offset + bb.reg_load_mod[op.rt] = 0x80000000 + op.si10 * 16 - bb.stack_sub; + } + + // Clear const + bb.reg_const[op.rt] = false; + break; + } + default: + { + // Clear const if reg is modified here + if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max) + bb.reg_const[reg] = false; + break; + } + } + + // $SP is modified + if (m_regmod[ia / 4] == s_reg_sp) + { + if (bb.reg_const[s_reg_sp]) + { + // Making $SP a constant is a funny thing too. 
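+					// A constant $SP cannot describe a stack frame, so treat the stack as unpredictable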
+ bb.stack_sub = 0x80000000; + } + + if (bb.stack_sub != 0x80000000) + { + switch (last_inst) + { + case spu_itype::AI: + { + if (op.ra == s_reg_sp) + bb.stack_sub -= op.si10; + else + bb.stack_sub = 0x80000000; + break; + } + case spu_itype::A: + { + if (op.ra == s_reg_sp && bb.reg_const[op.rb]) + bb.stack_sub -= bb.reg_val32[op.rb]; + else if (op.rb == s_reg_sp && bb.reg_const[op.ra]) + bb.stack_sub -= bb.reg_val32[op.ra]; + else + bb.stack_sub = 0x80000000; + break; + } + case spu_itype::SF: + { + if (op.rb == s_reg_sp && bb.reg_const[op.ra]) + bb.stack_sub += bb.reg_val32[op.ra]; + else + bb.stack_sub = 0x80000000; + break; + } + default: + { + bb.stack_sub = 0x80000000; + break; + } + } + } + + // Check for funny values. + if (bb.stack_sub >= 0x40000 || bb.stack_sub % 16) + { + bb.stack_sub = 0x80000000; + } + } + } + + // Analyse terminator instruction + const u32 tia = addr + bb.size * 4 - 4; + + switch (last_inst) + { + case spu_itype::BR: + case spu_itype::BRA: + case spu_itype::BRNZ: + case spu_itype::BRZ: + case spu_itype::BRHNZ: + case spu_itype::BRHZ: + case spu_itype::BRSL: + case spu_itype::BRASL: + { + const u32 target = spu_branch_target(last_inst == spu_itype::BRA || last_inst == spu_itype::BRASL ? 0 : tia, op.i16); + + if (target == tia + 4) + { + bb.terminator = term_type::fallthrough; + } + else if (last_inst != spu_itype::BRSL && last_inst != spu_itype::BRASL) + { + // No-op terminator or simple branch instruction + bb.terminator = term_type::br; + + if (target == bb.func) + { + // Recursive tail call + bb.terminator = term_type::ret; + } + } + else if (op.rt == s_reg_lr) + { + bb.terminator = term_type::call; + } + else + { + bb.terminator = term_type::interrupt_call; + } + + break; + } + case spu_itype::BI: + { + if (op.d || op.e || bb.targets.size() == 1) + { + bb.terminator = term_type::interrupt_call; + } + else if (bb.targets.size() > 1) + { + // Jump table + bb.terminator = term_type::br; + } + else if (op.ra == s_reg_lr) + { + // Return (TODO) + bb.terminator = term_type::ret; + } + else + { + // Indirect tail call (TODO) + bb.terminator = term_type::interrupt_call; + } + + break; + } + case spu_itype::BISLED: + case spu_itype::IRET: + { + bb.terminator = term_type::interrupt_call; + break; + } + case spu_itype::BISL: + case spu_itype::BIZ: + case spu_itype::BINZ: + case spu_itype::BIHZ: + case spu_itype::BIHNZ: + { + if (op.d || op.e || bb.targets.size() != 1) + { + bb.terminator = term_type::interrupt_call; + } + else if (last_inst != spu_itype::BISL && bb.targets[0] == tia + 4 && op.ra == s_reg_lr) + { + // Conditional return (TODO) + bb.terminator = term_type::ret; + } + else if (last_inst == spu_itype::BISL) + { + // Indirect call + bb.terminator = term_type::indirect_call; + } + else + { + // TODO + bb.terminator = term_type::interrupt_call; + } + + break; + } + default: + { + // Normal instruction + bb.terminator = term_type::fallthrough; + break; + } + } + } + + // Check function blocks + for (auto& f : m_funcs) + { + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + break; + } + + bool is_ok = true; + + u32 used_stack = 0; + + for (auto it = m_bbs.lower_bound(f.first); it != m_bbs.end() && it->second.func == f.first; ++it) + { + auto& bb = it->second; + auto& func = m_funcs.at(bb.func); + const u32 addr = it->first; + const u32 flim = bb.func + func.size * 4; + + used_stack |= bb.stack_sub; + + if (bb.stack_sub == 0x80000000) + { + is_ok = false; + } + + if (is_ok && bb.terminator >= term_type::indirect_call) + { + is_ok = false; + } + 
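+			// Returning blocks must have restored $LR, $SP and the non-volatile registers; verify that below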
+ if (is_ok && bb.terminator == term_type::ret) + { + // Check $LR (alternative return registers are currently not supported) + if (u32 lr_orig = bb.reg_mod[s_reg_lr] ? addr : bb.reg_origin_abs[s_reg_lr]; lr_orig < 0x40000) + { + auto& src = m_bbs.at(lr_orig); + + if (src.reg_load_mod[s_reg_lr] != func.reg_save_off[s_reg_lr]) + { + LOG_ERROR(SPU, "Function 0x%05x: $LR mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, lr_orig, src.reg_load_mod[0], func.reg_save_off[0]); + is_ok = false; + } + else if (src.reg_load_mod[s_reg_lr] == 0) + { + LOG_ERROR(SPU, "Function 0x%05x: $LR modified (src=0x%x)", f.first, lr_orig); + is_ok = false; + } + } + else if (lr_orig > 0x40000) + { + LOG_ERROR(SPU, "Function 0x%05x: $LR unpredictable", f.first); + is_ok = false; + } + + // Check $SP (should be restored or unmodified) + if (bb.stack_sub != 0) + { + LOG_WARNING(SPU, "Function 0x%05x: return with stack frame 0x%x", f.first, bb.stack_sub); + is_ok = false; + } + + // Check $80..$127 (should be restored or unmodified) + for (u32 i = s_reg_80; i <= s_reg_127; i++) + { + if (u32 orig = bb.reg_mod[i] ? addr : bb.reg_origin_abs[i]; orig < 0x40000) + { + auto& src = m_bbs.at(orig); + + if (src.reg_load_mod[i] != func.reg_save_off[i]) + { + LOG_ERROR(SPU, "Function 0x%05x: $%u mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, i, orig, src.reg_load_mod[i], func.reg_save_off[i]); + is_ok = false; } } - - if (tb.chunk != block.chunk && !(m_entry_info[target / 4] && m_ret_info[target / 4])) + else if (orig > 0x40000) { - // Skip call targets completely - continue; + LOG_ERROR(SPU, "Function 0x%05x: $%u unpredictable", f.first, i); + is_ok = false; } - if (tb.reg_origin_abs[i] != -2) + if (func.reg_save_off[i] == -1) { - const u32 expected = block.reg_mod[i] ? addr : block.reg_origin_abs[i]; + LOG_ERROR(SPU, "Function 0x%05x: $%u used incorrectly", f.first, i); + is_ok = false; + } + } + } - if (tb.reg_origin_abs[i] == 0x80000000) - { - tb.reg_origin_abs[i] = expected; - } - else if (tb.reg_origin_abs[i] != expected) - { - if (tb.reg_origin_abs[i] == 0x40000 || expected == -2 || expected == 0x40000) - { - // Set -2: sticky value indicating possible external reg origin (0x40000) - tb.reg_origin_abs[i] = -2; + if (is_ok && bb.terminator == term_type::call) + { + // Check call instruction (TODO) + if (bb.stack_sub == 0) + { + // Call without a stack frame + LOG_WARNING(SPU, "Function 0x%05x: frameless call", f.first); + is_ok = false; + } + } - must_repeat |= !tb.targets.empty(); - } - else if (tb.reg_origin_abs[i] != -1) - { - tb.reg_origin_abs[i] = -1; + if (is_ok && bb.terminator == term_type::fallthrough) + { + // Can't just fall out of the function + if (bb.targets.size() != 1 || bb.targets[0] >= flim) + { + LOG_ERROR(SPU, "Function 0x%05x: bad fallthrough to 0x%x", f.first, bb.targets[0]); + is_ok = false; + } + } - must_repeat |= !tb.targets.empty(); - } - } + // Fill external function targets (calls, possibly tail calls) + for (u32 target : bb.targets) + { + if (target < bb.func || target >= flim || (bb.terminator == term_type::call && target == bb.func)) + { + if (func.calls.find_first_of(target) == -1) + { + func.calls.push_back(target); } } } } - if (!must_repeat) + if (is_ok && used_stack && f.first == entry_point) { - break; + LOG_ERROR(SPU, "Function 0x%05x: disabled as possible chunk", f.first); + is_ok = false; } - for (u32 wi = 0; wi < workload.size(); wi++) + // if (is_ok && f.first > 0x1d240 && f.first < 0x1e000) + // { + // LOG_ERROR(SPU, "Function 0x%05x: manually disabled", f.first); + // is_ok = false; + 
// } + + f.second.good = is_ok; + } + + // Check function call graph + while (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + bool need_repeat = false; + + for (auto& f : m_funcs) { - const u32 addr = workload[wi]; - auto& block = m_bbs.at(addr); + if (!f.second.good) + { + continue; + } - // Reset values for the next attempt (keep negative values) - for (u32 i = 0; i < s_reg_max; i++) + for (u32 call : f.second.calls) { - if (block.reg_origin[i] <= 0x40000) - block.reg_origin[i] = 0x80000000; - if (block.reg_origin_abs[i] <= 0x40000) - block.reg_origin_abs[i] = 0x80000000; + const auto ffound = std::as_const(m_funcs).find(call); + + if (ffound == m_funcs.cend() || ffound->second.good == false) + { + need_repeat = true; + + if (f.second.good) + { + LOG_ERROR(SPU, "Function 0x%05x: calls bad function", f.first); + f.second.good = false; + } + } } } + + if (!need_repeat) + { + break; + } } if (result.size() == 1) @@ -2178,7 +2818,9 @@ void spu_recompiler_base::dump(std::string& out) { if (m_block_info[bb.first / 4]) { - fmt::append(out, "?: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block"); + fmt::append(out, "A: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block"); + + fmt::append(out, "\tF: 0x%05x\n", bb.second.func); for (u32 pred : bb.second.preds) { @@ -2187,12 +2829,24 @@ void spu_recompiler_base::dump(std::string& out) for (u32 target : bb.second.targets) { - fmt::append(out, "\t-> 0x%05x\n", target); + fmt::append(out, "\t-> 0x%05x%s\n", target, m_bbs.count(target) ? "" : " (null)"); } } else { - fmt::append(out, "?: [0x%05x] ?\n", bb.first); + fmt::append(out, "A: [0x%05x] ?\n", bb.first); + } + } + + for (auto& f : m_funcs) + { + fmt::append(out, "F: [0x%05x]%s\n", f.first, f.second.good ? " (good)" : " (bad)"); + + fmt::append(out, "\tN: 0x%05x\n", f.second.size * 4 + f.first); + + for (u32 call : f.second.calls) + { + fmt::append(out, "\t>> 0x%05x%s\n", call, m_funcs.count(call) ? 
"" : " (null)"); } } @@ -2261,6 +2915,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator struct block_info { + // Pointer to the analyser + spu_recompiler_base::block_info* bb{}; + // Current block's entry block llvm::BasicBlock* block; @@ -2277,27 +2934,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator std::array store{}; }; - struct chunk_info + struct function_info { - // Callable function - llvm::Function* func; - - // Constants in non-volatile registers at the entry point - std::array reg{}; + // Standard callable chunk + llvm::Function* chunk{}; - chunk_info() = default; + // Callable function + llvm::Function* fn{}; - chunk_info(llvm::Function* func) - : func(func) - { - } + // Registers possibly loaded in the entry block + std::array load{}; }; // Current block block_info* m_block; - // Current chunk - chunk_info* m_finfo; + // Current function or chunk + function_info* m_finfo; // All blocks in the current function chunk std::unordered_map> m_blocks; @@ -2306,17 +2959,22 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator std::vector m_block_queue; // All function chunks in current SPU compile unit - std::unordered_map> m_functions; + std::unordered_map> m_functions; // Function chunk list for processing std::vector m_function_queue; - // Helper - std::vector m_scan_queue; - // Add or get the function chunk - llvm::Function* add_function(u32 addr) + function_info* add_function(u32 addr) { + // Enqueue if necessary + const auto empl = m_functions.try_emplace(addr); + + if (!empl.second) + { + return &empl.first->second; + } + // Get function chunk name const std::string name = fmt::format("spu-chunk-0x%05x", addr); llvm::Function* result = llvm::cast(m_module->getOrInsertFunction(name, get_ftype()).getCallee()); @@ -2326,32 +2984,79 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator result->addAttribute(1, llvm::Attribute::NoAlias); result->addAttribute(2, llvm::Attribute::NoAlias); - // Enqueue if necessary - const auto empl = m_functions.emplace(addr, chunk_info{result}); + empl.first->second.chunk = result; - if (empl.second) + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) { - m_function_queue.push_back(addr); + // Find good real function + const auto ffound = m_funcs.find(addr); + + if (ffound != m_funcs.end() && ffound->second.good) + { + const std::string fname = fmt::format("spu-function-0x%05x", addr); + llvm::Function* fn = llvm::cast(m_module->getOrInsertFunction(fname, get_ftype()).getCallee()); + + fn->setLinkage(llvm::GlobalValue::InternalLinkage); + fn->addAttribute(1, llvm::Attribute::NoAlias); + fn->addAttribute(2, llvm::Attribute::NoAlias); + empl.first->second.fn = fn; + } + } + + // Enqueue + m_function_queue.push_back(addr); - if (m_block && g_cfg.core.spu_block_size != spu_block_size_type::safe) + return &empl.first->second; + } + + // Call the real function + void call_function(llvm::Function* fn, bool tail = false) + { + llvm::Value* lr{}; + + if (!m_finfo->fn) + { + if (m_block) + { + lr = m_ir->CreateExtractElement(get_reg_fixed(s_reg_lr).value, 3); + } + else { - // Initialize constants for non-volatile registers (TODO) - auto& regs = empl.first->second.reg; + lr = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, 0, &v128::_u32, 3)); + } + } + + const auto _call = m_ir->CreateCall(verify(HERE, fn), {m_thread, m_lsptr}); + + // Tail call using loaded LR value (gateway from a chunk) + if (!m_finfo->fn) + { + lr = 
m_ir->CreateAnd(lr, 0x3fffc); - for (u32 i = 80; i <= 127; i++) + m_ir->CreateStore(lr, spu_ptr(&spu_thread::pc)); + m_ir->CreateBr(add_block_indirect({}, value(lr))); + } + else if (tail) + { + _call->setTailCall(); + } + else + { + // TODO: initialize $LR with a constant + for (u32 i = 0; i < s_reg_max; i++) + { + if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127)) { - if (auto c = llvm::dyn_cast_or_null(m_block->reg[i])) - { - if (m_bbs.at(addr).reg_origin_abs[i] < 0x40000) - { - regs[i] = c; - } - } + m_block->reg[i] = m_ir->CreateLoad(init_reg_fixed(i)); } } } + } - return result; + // Emit return from the real function + void ret_function() + { + m_ir->CreateRetVoid(); } void set_function(llvm::Function* func) @@ -2378,15 +3083,56 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (m_blocks.empty()) { // Special case: first block, proceed normally + if (auto fn = std::exchange(m_finfo->fn, nullptr)) + { + // Create a gateway + call_function(fn, true); + + m_finfo->fn = fn; + m_function = fn; + m_thread = &*fn->arg_begin(); + m_lsptr = &*(fn->arg_begin() + 1); + m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", fn)); + m_memptr = m_ir->CreateIntToPtr(m_ir->getInt64((u64)vm::g_base_addr), get_type()); + + // Load registers at the entry chunk + for (u32 i = 0; i < s_reg_max; i++) + { + if (i >= s_reg_80 && i <= s_reg_127) + { + // TODO + //m_finfo->load[i] = llvm::UndefValue::get(get_reg_type(i)); + } + + m_finfo->load[i] = m_ir->CreateLoad(init_reg_fixed(i)); + } + } } - else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target)) + else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target) && (!m_finfo->fn || !m_ret_info[target / 4])) { // Generate a tail call to the function chunk const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); - tail(add_function(target)); + const auto pfinfo = add_function(target); + + if (pfinfo->fn) + { + // Tail call to the real function + call_function(pfinfo->fn, true); + + if (!result->getTerminator()) + { + ret_function(); + } + } + else + { + // Just a boring tail call to another chunk + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); + tail(pfinfo->chunk); + } + m_ir->SetInsertPoint(cblock); return result; } @@ -2664,7 +3410,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!reg) { // Load register value if necessary - reg = m_ir->CreateLoad(init_reg_fixed(index)); + reg = m_finfo && m_finfo->load[index] ? 
m_finfo->load[index] : m_ir->CreateLoad(init_reg_fixed(index)); } if (reg->getType() == get_type()) @@ -2688,44 +3434,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { if (const auto phi = llvm::dyn_cast(reg)) { - if (phi->getNumUses()) - { - LOG_WARNING(SPU, "[0x%x] $%u: Phi has uses :(", m_pos, index); - } - else - { - const auto cblock = m_ir->GetInsertBlock(); - m_ir->SetInsertPoint(phi); - - const auto newphi = m_ir->CreatePHI(get_type(), phi->getNumIncomingValues()); - - for (u32 i = 0; i < phi->getNumIncomingValues(); i++) - { - const auto iblock = phi->getIncomingBlock(i); - m_ir->SetInsertPoint(iblock->getTerminator()); - const auto ivalue = phi->getIncomingValue(i); - newphi->addIncoming(xfloat_to_double(ivalue), iblock); - } - - for (auto& b : m_blocks) - { - if (b.second.phi[index] == phi) - { - b.second.phi[index] = newphi; - } - - if (b.second.reg[index] == phi) - { - b.second.reg[index] = newphi; - } - } - - reg = newphi; - - m_ir->SetInsertPoint(cblock); - phi->eraseFromParent(); - return reg; - } } if (auto c = llvm::dyn_cast(reg)) @@ -2895,6 +3603,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator _store->eraseFromParent(); } + if (m_finfo && m_finfo->fn) + { + if (index == s_reg_lr || (index >= s_reg_80 && index <= s_reg_127)) + { + // Don't save some registers in true functions + return; + } + } + // Write register to the context _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, addr->getType()->getPointerElementType()), addr); } @@ -3297,7 +4014,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Call the entry function chunk const auto entry_chunk = add_function(m_pos); - m_ir->CreateCall(entry_chunk, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); + m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(label_stop); @@ -3317,14 +4034,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } // Create function table (uninitialized) - m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); + m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); // Create function chunks for (std::size_t fi = 0; fi < m_function_queue.size(); fi++) { // Initialize function info m_entry = m_function_queue[fi]; - set_function(m_functions[m_entry].func); + set_function(m_functions[m_entry].chunk); m_finfo = &m_functions[m_entry]; m_ir->CreateBr(add_block(m_entry)); @@ -3337,18 +4054,21 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(m_block->block); auto& bb = m_bbs.at(baddr); bool need_check = false; + m_block->bb = &bb; if (bb.preds.size()) { // Initialize registers and build PHI nodes if necessary for (u32 i = 0; i < s_reg_max; i++) { - const u32 src = bb.reg_origin[i]; + const u32 src = m_finfo->fn ? bb.reg_origin_abs[i] : bb.reg_origin[i]; - if (src == -1) + if (src > 0x40000) { - // TODO: type - const auto _phi = m_ir->CreatePHI(get_reg_type(i), ::size32(bb.preds)); + // Use the xfloat hint to create 256-bit (4x double) PHI + llvm::Type* type = g_cfg.core.spu_accurate_xfloat && bb.reg_maybe_xf[i] ? 
get_type() : get_reg_type(i); + + const auto _phi = m_ir->CreatePHI(type, ::size32(bb.preds)); m_block->phi[i] = _phi; m_block->reg[i] = _phi; @@ -3369,13 +4089,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!value) { // Value hasn't been loaded yet - value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr); + value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(regptr); } - if (value->getType() == get_type()) + if (value->getType() == get_type() && type != get_type()) { value = double_to_xfloat(value); } + else if (value->getType() != get_type() && type == get_type()) + { + value = xfloat_to_double(m_ir->CreateBitCast(value, get_type())); + } else if (i < 128 && llvm::isa(value)) { // Bitcast the constant @@ -3402,7 +4126,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto regptr = init_reg_fixed(i); const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); - const auto value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr); + const auto value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(regptr); m_ir->SetInsertPoint(cblock); _phi->addIncoming(value, &m_function->getEntryBlock()); } @@ -3421,10 +4145,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator LOG_ERROR(SPU, "[0x%05x] Value not found ($%u from 0x%05x)", baddr, i, src); } } - else if (baddr == m_entry) + else { - // Passthrough constant from a different chunk (will be removed in future) - m_block->reg[i] = m_finfo->reg[i]; + m_block->reg[i] = m_finfo->load[i]; } } @@ -3523,29 +4246,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (found == m_functions.end()) { - if (m_entry_info[i / 4]) - { - LOG_ERROR(SPU, "[0x%x] Function chunk not compiled: 0x%x", func[0], i); - } - chunks.push_back(null); continue; } - chunks.push_back(found->second.func); - - // If a chunk has incoming constants, we can't add it to the function table (TODO) - for (const auto c : found->second.reg) - { - if (c != nullptr) - { - chunks.back() = null; - break; - } - } + chunks.push_back(found->second.chunk); } - m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), chunks)); + m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), chunks)); } else { @@ -3566,7 +4274,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator for (const auto& func : m_functions) { - const auto f = func.second.func; + const auto f = func.second.fn ? 
func.second.fn : func.second.chunk; pm.run(*f); for (auto& bb : *f) @@ -3581,31 +4289,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator li->eraseFromParent(); break; } - - // Replace volatile fake store with return - if (auto si = dyn_cast(&i); si && si->getOperand(1) == m_fake_global1) - { - const auto br = bb.getTerminator(); - - for (auto& j : *br->getSuccessor(0)) - { - // Cleanup PHI nodes if exist - if (auto phi = dyn_cast(&j)) - { - phi->removeIncomingValue(&bb, false); - } - else - { - break; - } - } - - m_ir->SetInsertPoint(bb.getTerminator()); - m_ir->CreateRetVoid(); - si->eraseFromParent(); - br->eraseFromParent(); - break; - } } } } @@ -3615,7 +4298,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_block_queue.clear(); m_functions.clear(); m_function_queue.clear(); - m_scan_queue.clear(); m_function_table = nullptr; std::string log; @@ -6443,6 +7125,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void STQD(spu_opcode_t op) { + if (m_finfo && m_finfo->fn) + { + if (op.rt == s_reg_lr || (op.rt >= s_reg_80 && op.rt <= s_reg_127)) + { + if (m_block->bb->reg_save_dom[op.rt] && get_reg_raw(op.rt) == m_finfo->load[op.rt]) + { + return; + } + } + } + value_t addr = eval(zext((extract(get_vr(op.ra), 3) + (get_imm(op.si10) << 4)) & 0x3fff0)); make_store_ls(addr, get_vr(op.rt)); } @@ -6578,7 +7271,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } // Convert an indirect branch into a static one if possible - if (const auto _int = llvm::dyn_cast(addr.value)) + if (const auto _int = llvm::dyn_cast(addr.value); _int && op.opcode) { const u32 target = ::narrow(_int->getZExtValue(), HERE); @@ -6601,17 +7294,34 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Fixed branch excludes the possibility it's a function return (TODO) ret = false; } - else if (llvm::isa(addr.value)) + else if (llvm::isa(addr.value) && op.opcode) { LOG_ERROR(SPU, "[0x%x] Unexpected constant (add_block_indirect)", m_pos); } + if (m_finfo && m_finfo->fn && op.opcode) + { + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + ret_function(); + m_ir->SetInsertPoint(cblock); + return result; + } + // Load stack addr if necessary value_t sp; if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) { - sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); + if (op.opcode) + { + sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); + } + else + { + sp.value = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, 1, &v128::_u32, 3)); + } } const auto cblock = m_ir->GetInsertBlock(); @@ -6920,6 +7630,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRASL(spu_opcode_t op) // { set_link(op); + + const u32 target = spu_branch_target(0, op.i16); + + if (m_finfo && m_finfo->fn && target != m_pos + 4) + { + if (auto fn = add_function(target)->fn) + { + call_function(fn); + return; + } + else + { + LOG_FATAL(SPU, "[0x%x] Can't add function 0x%x", m_pos, target); + return; + } + } + BRA(op); } @@ -6946,6 +7673,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRSL(spu_opcode_t op) // { set_link(op); + + const u32 target = spu_branch_target(m_pos, op.i16); + + if (m_finfo && m_finfo->fn && target != m_pos + 4) + { + if (auto fn = add_function(target)->fn) + { + call_function(fn); + return; + } + else 
+			{
+				LOG_FATAL(SPU, "[0x%x] Can't add function 0x%x", m_pos, target);
+				return;
+			}
+		}
+
 		BR(op);
 	}
 
@@ -6961,13 +7705,18 @@
 		set_vr(op.rt, build(0, 0, 0, spu_branch_target(m_pos + 4)));
 
+		if (m_finfo && m_finfo->fn)
+		{
+			return;
+		}
+
 		if (g_cfg.core.spu_block_size != spu_block_size_type::safe && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1])
 		{
 			// Store the return function chunk address at the stack mirror
-			const auto func = add_function(m_pos + 4);
+			const auto pfunc = add_function(m_pos + 4);
 			const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror));
 			const auto stack1 = eval(stack0 + 8);
-			m_ir->CreateStore(func, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), func->getType()->getPointerTo()));
+			m_ir->CreateStore(pfunc->chunk, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), pfunc->chunk->getType()->getPointerTo()));
 			m_ir->CreateStore(m_ir->getInt64(m_pos + 4), m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack1.value), get_type<u64*>()));
 		}
 	}
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index af5ad3c70f66..b02369b2cd05 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -199,6 +199,17 @@ class spu_recompiler_base
 		s_reg_max
 	};
 
+	// Classify terminator instructions
+	enum class term_type : unsigned char
+	{
+		br,
+		ret,
+		call,
+		fallthrough,
+		indirect_call,
+		interrupt_call,
+	};
+
 protected:
 	std::shared_ptr<spu_runtime> m_spurt;
 
@@ -239,12 +250,39 @@ class spu_recompiler_base
 		// Internal use flag
 		bool analysed = false;
 
+		// Terminator instruction type
+		term_type terminator;
+
 		// Bit mask of the registers modified in the block
 		std::bitset<s_reg_max> reg_mod{};
 
+		// Set if the last modifying instruction produces xfloat
+		std::bitset<s_reg_max> reg_mod_xf{};
+
+		// Set if the initial register value in this block may be xfloat
+		std::bitset<s_reg_max> reg_maybe_xf{};
+
 		// Bit mask of the registers used (before modified)
 		std::bitset<s_reg_max> reg_use{};
 
+		// Bit mask of the registers holding a trivial (u32 x 4) constant value at the end of this block
+		std::bitset<s_reg_max> reg_const{};
+
+		// Bit mask of the registers saved onto the stack before use
+		std::bitset<s_reg_max> reg_save_dom{};
+
+		// Address of the function
+		u32 func = 0x40000;
+
+		// Value subtracted from $SP in this block (0x80000000 if $SP is modified unpredictably)
+		u32 stack_sub = 0;
+
+		// Constant values associated with reg_const
+		std::array<u32, s_reg_max> reg_val32;
+
+		// Registers loaded from the stack in this block (stack offset)
+		std::array<u32, s_reg_max> reg_load_mod{};
+
 		// Single source of the reg value (dominating block address within the same chunk) or a negative number
 		std::array<u32, s_reg_max> reg_origin, reg_origin_abs;
 
@@ -258,13 +296,27 @@ class spu_recompiler_base
 	// Sorted basic block info
 	std::map<u32, block_info> m_bbs;
 
-	// Advanced block (chunk) information
-	struct chunk_info
+	// Sorted advanced block (chunk) list
+	std::basic_string<u32> m_chunks;
+
+	// Function information
+	struct func_info
 	{
+		// Size (in words) to the end of the last basic block
+		u16 size = 0;
+
+		// Determines whether a function is eligible for optimizations
+		bool good = false;
+
+		// Call targets
+		std::basic_string<u32> calls;
+
+		// Register save info (stack offset)
+		std::array<u32, s_reg_max> reg_save_off{};
 	};
 
-	// Sorted chunk info
-	std::map<u32, chunk_info> m_chunks;
+	// Sorted function info
+	std::map<u32, func_info> m_funcs;
 
 	std::shared_ptr<spu_cache> m_cache;