diff --git a/rpcs3/Emu/Cell/SPUAnalyser.h b/rpcs3/Emu/Cell/SPUAnalyser.h index adaa4ebc6489..65ac1d5d9710 100644 --- a/rpcs3/Emu/Cell/SPUAnalyser.h +++ b/rpcs3/Emu/Cell/SPUAnalyser.h @@ -11,6 +11,7 @@ struct spu_itype static constexpr struct branch_tag{} branch{}; // Branch Instructions static constexpr struct floating_tag{} floating{}; // Floating-Point Instructions static constexpr struct quadrop_tag{} _quadrop{}; // 4-op Instructions + static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values enum type : unsigned char { @@ -146,24 +147,26 @@ struct spu_itype FMS, // quadrop_tag last FA, - DFA, FS, - DFS, FM, + FREST, + FRSQEST, + FI, + CSFLT, + CUFLT, + FRDS, // xfloat_tag last + + DFA, + DFS, DFM, DFMA, DFNMS, DFMS, DFNMA, - FREST, - FRSQEST, - FI, - CSFLT, + FESD, + CFLTS, - CUFLT, CFLTU, - FRDS, - FESD, FCEQ, FCMEQ, FCGT, @@ -252,6 +255,12 @@ struct spu_itype { return value >= MPYA && value <= FMS; } + + // Test for xfloat instruction + friend constexpr bool operator &(type value, xfloat_tag) + { + return value >= FMA && value <= FRDS; + } }; struct spu_iflag diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 54ef3a8cd2b4..5ec798081774 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -915,6 +915,8 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en m_preds.clear(); m_preds[entry_point]; m_bbs.clear(); + m_chunks.clear(); + m_funcs.clear(); // Value flags (TODO) enum class vf : u32 @@ -1885,13 +1887,36 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en { block.size++; + // Decode instruction + const spu_opcode_t op{se_storage::swap(result[(ia - lsa) / 4 + 1])}; + + const auto type = s_spu_itype.decode(op.opcode); + + u8 reg_save = 255; + + if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt]) + { + // Register saved onto the stack before use + block.reg_save_dom[op.rt] = true; + + reg_save = op.rt; + } + for (auto* _use : {&m_use_ra, &m_use_rb, &m_use_rc}) { if (u8 reg = (*_use)[ia / 4]; reg < s_reg_max) { // Register reg use only if it happens before reg mod if (!block.reg_mod[reg]) + { block.reg_use.set(reg); + + if (reg_save != reg && block.reg_save_dom[reg]) + { + // Register is still used after saving; probably not eligible for optimization + block.reg_save_dom[reg] = false; + } + } } } @@ -1909,6 +1934,16 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max) { block.reg_mod.set(reg); + block.reg_mod_xf.set(reg, type & spu_itype::xfloat); + + if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb])) + block.reg_mod_xf.set(reg); + + // Possible post-dominating register load + if (type == spu_itype::LQD && op.ra == s_reg_sp) + block.reg_load_mod[reg] = ia + 1; + else + block.reg_load_mod[reg] = 0; } // Find targets (also means end of the block) @@ -1918,6 +1953,25 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en { // Copy targets block.targets = tfound->second; + + // Assume that the call modifies all volatile registers + if (type == spu_itype::BRSL || type == spu_itype::BRASL) + { + const u32 target = spu_branch_target(type == spu_itype::BRASL ? 
0 : ia, op.i16); + + if (target != ia + 4) + { + for (u32 i = 0; i < s_reg_max; ++i) + { + if (i == s_reg_lr || (i >= s_reg_80 && i <= s_reg_127)) + { + block.reg_mod.set(i); + block.reg_mod_xf[i] = false; + } + } + } + } + break; } } @@ -1926,10 +1980,91 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en // Fixup block predeccessors to point to basic blocks, not last instructions for (auto& bb : m_bbs) { + const u32 addr = bb.first; + for (u32& pred : bb.second.preds) { pred = std::prev(m_bbs.upper_bound(pred))->first; } + + if (m_entry_info[addr / 4]) + { + // Register empty chunk + m_chunks.push_back(addr); + + // Register function if necessary + if (!m_ret_info[addr / 4]) + { + m_funcs[addr]; + } + } + } + + // Ensure there is a function at the lowest address + if (auto emp = m_funcs.try_emplace(m_bbs.begin()->first); emp.second) + { + const u32 addr = emp.first->first; + LOG_ERROR(SPU, "Fixed first function at 0x%05x", addr); + m_entry_info[addr / 4] = true; + m_ret_info[addr / 4] = false; + } + + // Split functions + while (true) + { + bool need_repeat = false; + + u32 start = 0; + u32 limit = 0x40000; + + // Walk block list in ascending order + for (auto& block : m_bbs) + { + const u32 addr = block.first; + + if (m_entry_info[addr / 4] && !m_ret_info[addr / 4]) + { + const auto upper = m_funcs.upper_bound(addr); + start = addr; + limit = upper == m_funcs.end() ? 0x40000 : upper->first; + } + + // Find targets that exceed [start; limit) range and make new functions from them + for (u32 target : block.second.targets) + { + const auto tfound = m_bbs.find(target); + + if (tfound == m_bbs.end()) + { + continue; + } + + if (target < start || target >= limit) + { + if (!m_entry_info[target / 4] || m_ret_info[target / 4]) + { + // Create new function entry (likely a tail call) + m_entry_info[target / 4] = true; + + m_ret_info[target / 4] = false; + + m_funcs.try_emplace(target); + + if (target < limit) + { + need_repeat = true; + } + } + } + } + + block.second.func = start; + } + + if (!need_repeat) + { + break; + } } // Fill entry map, add chunk addresses @@ -1951,7 +2086,7 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en // Check block predecessors for (u32 pred : block.preds) { - const u32 _old = m_bbs[pred].chunk; + const u32 _old = m_bbs.at(pred).chunk; if (_old < 0x40000 && _old != _new) { @@ -2040,6 +2175,12 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en workload.push_back(target); tb.analysed = true; } + + // Limited xfloat hint propagation (possibly TODO) + if (tb.chunk == block.chunk) + { + tb.reg_maybe_xf |= block.reg_mod_xf; + } } block.reg_origin.fill(0x80000000); @@ -2103,64 +2244,563 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en // Set -1 if multiple origins merged (requires PHI node) tb.reg_origin[i] = -1; - must_repeat |= !tb.targets.empty(); + must_repeat |= !tb.targets.empty(); + } + } + + if (tb.chunk != block.chunk && !(m_entry_info[target / 4] && m_ret_info[target / 4])) + { + // Skip call targets completely + continue; + } + + if (tb.reg_origin_abs[i] != -2) + { + const u32 expected = block.reg_mod[i] ? 
addr : block.reg_origin_abs[i]; + + if (tb.reg_origin_abs[i] == 0x80000000) + { + tb.reg_origin_abs[i] = expected; + } + else if (tb.reg_origin_abs[i] != expected) + { + if (tb.reg_origin_abs[i] == 0x40000 || expected == -2 || expected == 0x40000) + { + // Set -2: sticky value indicating possible external reg origin (0x40000) + tb.reg_origin_abs[i] = -2; + + must_repeat |= !tb.targets.empty(); + } + else if (tb.reg_origin_abs[i] != -1) + { + tb.reg_origin_abs[i] = -1; + + must_repeat |= !tb.targets.empty(); + } + } + } + } + } + } + + if (!must_repeat) + { + break; + } + + for (u32 wi = 0; wi < workload.size(); wi++) + { + const u32 addr = workload[wi]; + auto& block = m_bbs.at(addr); + + // Reset values for the next attempt (keep negative values) + for (u32 i = 0; i < s_reg_max; i++) + { + if (block.reg_origin[i] <= 0x40000) + block.reg_origin[i] = 0x80000000; + if (block.reg_origin_abs[i] <= 0x40000) + block.reg_origin_abs[i] = 0x80000000; + } + } + } + + // Fill more block info + for (u32 wi = 0; wi < workload.size(); wi++) + { + const u32 addr = workload[wi]; + auto& bb = m_bbs.at(addr); + auto& func = m_funcs.at(bb.func); + + // Update function size + func.size = std::max(func.size, bb.size + (addr - bb.func) / 4); + + // Copy constants according to reg_origin info + for (u32 i = 0; i < s_reg_max; i++) + { + // Select reg origin (this isn't completely safe) + const u32 orig = bb.reg_origin_abs[i]; + + if (orig < 0x40000) + { + auto& src = m_bbs.at(orig); + bb.reg_const[i] = src.reg_const[i]; + bb.reg_val32[i] = src.reg_val32[i]; + } + + if (!bb.reg_save_dom[i] && bb.reg_use[i] && (orig == 0x40000 || orig == -2)) + { + // Destroy offset if external reg value is used + func.reg_save_off[i] = -1; + } + } + + if (u32 orig = bb.reg_origin_abs[s_reg_sp]; orig < 0x40000) + { + auto& prologue = m_bbs.at(orig); + + // Copy stack offset (from the assumed prologue) + bb.stack_sub = prologue.stack_sub; + } + else if (orig > 0x40000) + { + // Unpredictable stack + bb.stack_sub = 0x80000000; + } + + spu_opcode_t op; + + auto last_inst = spu_itype::UNK; + + for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4) + { + // Decode instruction again + op.opcode = se_storage::swap(result[(ia - lsa) / 4 + 1]); + last_inst = s_spu_itype.decode(op.opcode); + + // Propagate some constants + switch (last_inst) + { + case spu_itype::IL: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.si16; + break; + } + case spu_itype::ILA: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.i18; + break; + } + case spu_itype::ILHU: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.i16 << 16; + break; + } + case spu_itype::ILH: + { + bb.reg_const[op.rt] = true; + bb.reg_val32[op.rt] = op.i16 << 16 | op.i16; + break; + } + case spu_itype::IOHL: + { + bb.reg_val32[op.rt] = bb.reg_val32[op.rt] | op.i16; + break; + } + case spu_itype::ORI: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | op.si10; + break; + } + case spu_itype::OR: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | bb.reg_val32[op.rb]; + break; + } + case spu_itype::AI: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + op.si10; + break; + } + case spu_itype::A: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb]; + bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + bb.reg_val32[op.rb]; + break; + } + case spu_itype::SFI: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra]; 
+ bb.reg_val32[op.rt] = op.si10 - bb.reg_val32[op.ra]; + break; + } + case spu_itype::SF: + { + bb.reg_const[op.rt] = bb.reg_const[op.ra] & bb.reg_const[op.rb]; + bb.reg_val32[op.rt] = bb.reg_val32[op.rb] - bb.reg_val32[op.ra]; + break; + } + case spu_itype::STQD: + { + if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_save_dom[op.rt]) + { + const u32 offset = 0x80000000 + op.si10 * 16 - bb.stack_sub; + + if (func.reg_save_off[op.rt] == 0) + { + // Store reg save offset + func.reg_save_off[op.rt] = offset; + } + else if (func.reg_save_off[op.rt] != offset) + { + // Conflict of different offsets + func.reg_save_off[op.rt] = -1; + } + } + + break; + } + case spu_itype::LQD: + { + if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_load_mod[op.rt] == ia + 1) + { + // Adjust reg load offset + bb.reg_load_mod[op.rt] = 0x80000000 + op.si10 * 16 - bb.stack_sub; + } + + // Clear const + bb.reg_const[op.rt] = false; + break; + } + default: + { + // Clear const if reg is modified here + if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max) + bb.reg_const[reg] = false; + break; + } + } + + // $SP is modified + if (m_regmod[ia / 4] == s_reg_sp) + { + if (bb.reg_const[s_reg_sp]) + { + // Making $SP a constant is a funny thing too. + bb.stack_sub = 0x80000000; + } + + if (bb.stack_sub != 0x80000000) + { + switch (last_inst) + { + case spu_itype::AI: + { + if (op.ra == s_reg_sp) + bb.stack_sub -= op.si10; + else + bb.stack_sub = 0x80000000; + break; + } + case spu_itype::A: + { + if (op.ra == s_reg_sp && bb.reg_const[op.rb]) + bb.stack_sub -= bb.reg_val32[op.rb]; + else if (op.rb == s_reg_sp && bb.reg_const[op.ra]) + bb.stack_sub -= bb.reg_val32[op.ra]; + else + bb.stack_sub = 0x80000000; + break; + } + case spu_itype::SF: + { + if (op.rb == s_reg_sp && bb.reg_const[op.ra]) + bb.stack_sub += bb.reg_val32[op.ra]; + else + bb.stack_sub = 0x80000000; + break; + } + default: + { + bb.stack_sub = 0x80000000; + break; + } + } + } + + // Check for funny values. + if (bb.stack_sub >= 0x40000 || bb.stack_sub % 16) + { + bb.stack_sub = 0x80000000; + } + } + } + + // Analyse terminator instruction + const u32 tia = addr + bb.size * 4 - 4; + + switch (last_inst) + { + case spu_itype::BR: + case spu_itype::BRA: + case spu_itype::BRNZ: + case spu_itype::BRZ: + case spu_itype::BRHNZ: + case spu_itype::BRHZ: + case spu_itype::BRSL: + case spu_itype::BRASL: + { + const u32 target = spu_branch_target(last_inst == spu_itype::BRA || last_inst == spu_itype::BRASL ? 
0 : tia, op.i16); + + if (target == tia + 4) + { + bb.terminator = term_type::fallthrough; + } + else if (last_inst != spu_itype::BRSL && last_inst != spu_itype::BRASL) + { + // No-op terminator or simple branch instruction + bb.terminator = term_type::br; + + if (target == bb.func) + { + // Recursive tail call + bb.terminator = term_type::ret; + } + } + else if (op.rt == s_reg_lr) + { + bb.terminator = term_type::call; + } + else + { + bb.terminator = term_type::interrupt_call; + } + + break; + } + case spu_itype::BI: + { + if (op.d || op.e || bb.targets.size() == 1) + { + bb.terminator = term_type::interrupt_call; + } + else if (bb.targets.size() > 1) + { + // Jump table + bb.terminator = term_type::br; + } + else if (op.ra == s_reg_lr) + { + // Return (TODO) + bb.terminator = term_type::ret; + } + else + { + // Indirect tail call (TODO) + bb.terminator = term_type::interrupt_call; + } + + break; + } + case spu_itype::BISLED: + case spu_itype::IRET: + { + bb.terminator = term_type::interrupt_call; + break; + } + case spu_itype::BISL: + case spu_itype::BIZ: + case spu_itype::BINZ: + case spu_itype::BIHZ: + case spu_itype::BIHNZ: + { + if (op.d || op.e || bb.targets.size() != 1) + { + bb.terminator = term_type::interrupt_call; + } + else if (last_inst != spu_itype::BISL && bb.targets[0] == tia + 4 && op.ra == s_reg_lr) + { + // Conditional return (TODO) + bb.terminator = term_type::ret; + } + else if (last_inst == spu_itype::BISL) + { + // Indirect call + bb.terminator = term_type::indirect_call; + } + else + { + // TODO + bb.terminator = term_type::interrupt_call; + } + + break; + } + default: + { + // Normal instruction + bb.terminator = term_type::fallthrough; + break; + } + } + } + + // Check function blocks + for (auto& f : m_funcs) + { + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + break; + } + + bool is_ok = true; + + u32 used_stack = 0; + + for (auto it = m_bbs.lower_bound(f.first); it != m_bbs.end() && it->second.func == f.first; ++it) + { + auto& bb = it->second; + auto& func = m_funcs.at(bb.func); + const u32 addr = it->first; + const u32 flim = bb.func + func.size * 4; + + used_stack |= bb.stack_sub; + + if (bb.stack_sub == 0x80000000) + { + is_ok = false; + } + + if (is_ok && bb.terminator >= term_type::indirect_call) + { + is_ok = false; + } + + if (is_ok && bb.terminator == term_type::ret) + { + // Check $LR (alternative return registers are currently not supported) + if (u32 lr_orig = bb.reg_mod[s_reg_lr] ? addr : bb.reg_origin_abs[s_reg_lr]; lr_orig < 0x40000) + { + auto& src = m_bbs.at(lr_orig); + + if (src.reg_load_mod[s_reg_lr] != func.reg_save_off[s_reg_lr]) + { + LOG_ERROR(SPU, "Function 0x%05x: $LR mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, lr_orig, src.reg_load_mod[0], func.reg_save_off[0]); + is_ok = false; + } + else if (src.reg_load_mod[s_reg_lr] == 0) + { + LOG_ERROR(SPU, "Function 0x%05x: $LR modified (src=0x%x)", f.first, lr_orig); + is_ok = false; + } + } + else if (lr_orig > 0x40000) + { + LOG_ERROR(SPU, "Function 0x%05x: $LR unpredictable", f.first); + is_ok = false; + } + + // Check $SP (should be restored or unmodified) + if (bb.stack_sub != 0) + { + LOG_WARNING(SPU, "Function 0x%05x: return with stack frame 0x%x", f.first, bb.stack_sub); + is_ok = false; + } + + // Check $80..$127 (should be restored or unmodified) + for (u32 i = s_reg_80; i <= s_reg_127; i++) + { + if (u32 orig = bb.reg_mod[i] ? 
addr : bb.reg_origin_abs[i]; orig < 0x40000) + { + auto& src = m_bbs.at(orig); + + if (src.reg_load_mod[i] != func.reg_save_off[i]) + { + LOG_ERROR(SPU, "Function 0x%05x: $%u mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, i, orig, src.reg_load_mod[i], func.reg_save_off[i]); + is_ok = false; } } - - if (tb.chunk != block.chunk && !(m_entry_info[target / 4] && m_ret_info[target / 4])) + else if (orig > 0x40000) { - // Skip call targets completely - continue; + LOG_ERROR(SPU, "Function 0x%05x: $%u unpredictable", f.first, i); + is_ok = false; } - if (tb.reg_origin_abs[i] != -2) + if (func.reg_save_off[i] == -1) { - const u32 expected = block.reg_mod[i] ? addr : block.reg_origin_abs[i]; + LOG_ERROR(SPU, "Function 0x%05x: $%u used incorrectly", f.first, i); + is_ok = false; + } + } + } - if (tb.reg_origin_abs[i] == 0x80000000) - { - tb.reg_origin_abs[i] = expected; - } - else if (tb.reg_origin_abs[i] != expected) - { - if (tb.reg_origin_abs[i] == 0x40000 || expected == -2 || expected == 0x40000) - { - // Set -2: sticky value indicating possible external reg origin (0x40000) - tb.reg_origin_abs[i] = -2; + if (is_ok && bb.terminator == term_type::call) + { + // Check call instruction (TODO) + if (bb.stack_sub == 0) + { + // Call without a stack frame + LOG_WARNING(SPU, "Function 0x%05x: frameless call", f.first); + is_ok = false; + } + } - must_repeat |= !tb.targets.empty(); - } - else if (tb.reg_origin_abs[i] != -1) - { - tb.reg_origin_abs[i] = -1; + if (is_ok && bb.terminator == term_type::fallthrough) + { + // Can't just fall out of the function + if (bb.targets.size() != 1 || bb.targets[0] >= flim) + { + LOG_ERROR(SPU, "Function 0x%05x: bad fallthrough to 0x%x", f.first, bb.targets[0]); + is_ok = false; + } + } - must_repeat |= !tb.targets.empty(); - } - } + // Fill external function targets (calls, possibly tail calls) + for (u32 target : bb.targets) + { + if (target < bb.func || target >= flim || (bb.terminator == term_type::call && target == bb.func)) + { + if (func.calls.find_first_of(target) == -1) + { + func.calls.push_back(target); } } } } - if (!must_repeat) + if (is_ok && used_stack && f.first == entry_point) { - break; + LOG_ERROR(SPU, "Function 0x%05x: disabled as possible chunk", f.first); + is_ok = false; } - for (u32 wi = 0; wi < workload.size(); wi++) + // if (is_ok && f.first > 0x1d240 && f.first < 0x1e000) + // { + // LOG_ERROR(SPU, "Function 0x%05x: manually disabled", f.first); + // is_ok = false; + // } + + f.second.good = is_ok; + } + + // Check function call graph + while (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + bool need_repeat = false; + + for (auto& f : m_funcs) { - const u32 addr = workload[wi]; - auto& block = m_bbs.at(addr); + if (!f.second.good) + { + continue; + } - // Reset values for the next attempt (keep negative values) - for (u32 i = 0; i < s_reg_max; i++) + for (u32 call : f.second.calls) { - if (block.reg_origin[i] <= 0x40000) - block.reg_origin[i] = 0x80000000; - if (block.reg_origin_abs[i] <= 0x40000) - block.reg_origin_abs[i] = 0x80000000; + const auto ffound = std::as_const(m_funcs).find(call); + + if (ffound == m_funcs.cend() || ffound->second.good == false) + { + need_repeat = true; + + if (f.second.good) + { + LOG_ERROR(SPU, "Function 0x%05x: calls bad function", f.first); + f.second.good = false; + } + } } } + + if (!need_repeat) + { + break; + } } if (result.size() == 1) @@ -2178,7 +2818,9 @@ void spu_recompiler_base::dump(std::string& out) { if (m_block_info[bb.first / 4]) { - fmt::append(out, "?: [0x%05x] %s\n", bb.first, 
m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block"); + fmt::append(out, "A: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block"); + + fmt::append(out, "\tF: 0x%05x\n", bb.second.func); for (u32 pred : bb.second.preds) { @@ -2187,12 +2829,24 @@ void spu_recompiler_base::dump(std::string& out) for (u32 target : bb.second.targets) { - fmt::append(out, "\t-> 0x%05x\n", target); + fmt::append(out, "\t-> 0x%05x%s\n", target, m_bbs.count(target) ? "" : " (null)"); } } else { - fmt::append(out, "?: [0x%05x] ?\n", bb.first); + fmt::append(out, "A: [0x%05x] ?\n", bb.first); + } + } + + for (auto& f : m_funcs) + { + fmt::append(out, "F: [0x%05x]%s\n", f.first, f.second.good ? " (good)" : " (bad)"); + + fmt::append(out, "\tN: 0x%05x\n", f.second.size * 4 + f.first); + + for (u32 call : f.second.calls) + { + fmt::append(out, "\t>> 0x%05x%s\n", call, m_funcs.count(call) ? "" : " (null)"); } } @@ -2261,6 +2915,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator struct block_info { + // Pointer to the analyser + spu_recompiler_base::block_info* bb{}; + // Current block's entry block llvm::BasicBlock* block; @@ -2277,27 +2934,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator std::array store{}; }; - struct chunk_info + struct function_info { - // Callable function - llvm::Function* func; - - // Constants in non-volatile registers at the entry point - std::array reg{}; + // Standard callable chunk + llvm::Function* chunk{}; - chunk_info() = default; + // Callable function + llvm::Function* fn{}; - chunk_info(llvm::Function* func) - : func(func) - { - } + // Registers possibly loaded in the entry block + std::array load{}; }; // Current block block_info* m_block; - // Current chunk - chunk_info* m_finfo; + // Current function or chunk + function_info* m_finfo; // All blocks in the current function chunk std::unordered_map> m_blocks; @@ -2306,17 +2959,22 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator std::vector m_block_queue; // All function chunks in current SPU compile unit - std::unordered_map> m_functions; + std::unordered_map> m_functions; // Function chunk list for processing std::vector m_function_queue; - // Helper - std::vector m_scan_queue; - // Add or get the function chunk - llvm::Function* add_function(u32 addr) + function_info* add_function(u32 addr) { + // Enqueue if necessary + const auto empl = m_functions.try_emplace(addr); + + if (!empl.second) + { + return &empl.first->second; + } + // Get function chunk name const std::string name = fmt::format("spu-chunk-0x%05x", addr); llvm::Function* result = llvm::cast(m_module->getOrInsertFunction(name, get_ftype()).getCallee()); @@ -2326,32 +2984,79 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator result->addAttribute(1, llvm::Attribute::NoAlias); result->addAttribute(2, llvm::Attribute::NoAlias); - // Enqueue if necessary - const auto empl = m_functions.emplace(addr, chunk_info{result}); + empl.first->second.chunk = result; - if (empl.second) + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) { - m_function_queue.push_back(addr); + // Find good real function + const auto ffound = m_funcs.find(addr); + + if (ffound != m_funcs.end() && ffound->second.good) + { + const std::string fname = fmt::format("spu-function-0x%05x", addr); + llvm::Function* fn = llvm::cast(m_module->getOrInsertFunction(fname, 
get_ftype()).getCallee()); + + fn->setLinkage(llvm::GlobalValue::InternalLinkage); + fn->addAttribute(1, llvm::Attribute::NoAlias); + fn->addAttribute(2, llvm::Attribute::NoAlias); + empl.first->second.fn = fn; + } + } + + // Enqueue + m_function_queue.push_back(addr); - if (m_block && g_cfg.core.spu_block_size != spu_block_size_type::safe) + return &empl.first->second; + } + + // Call the real function + void call_function(llvm::Function* fn, bool tail = false) + { + llvm::Value* lr{}; + + if (!m_finfo->fn) + { + if (m_block) + { + lr = m_ir->CreateExtractElement(get_reg_fixed(s_reg_lr).value, 3); + } + else { - // Initialize constants for non-volatile registers (TODO) - auto& regs = empl.first->second.reg; + lr = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, 0, &v128::_u32, 3)); + } + } + + const auto _call = m_ir->CreateCall(verify(HERE, fn), {m_thread, m_lsptr}); + + // Tail call using loaded LR value (gateway from a chunk) + if (!m_finfo->fn) + { + lr = m_ir->CreateAnd(lr, 0x3fffc); - for (u32 i = 80; i <= 127; i++) + m_ir->CreateStore(lr, spu_ptr(&spu_thread::pc)); + m_ir->CreateBr(add_block_indirect({}, value(lr))); + } + else if (tail) + { + _call->setTailCall(); + } + else + { + // TODO: initialize $LR with a constant + for (u32 i = 0; i < s_reg_max; i++) + { + if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127)) { - if (auto c = llvm::dyn_cast_or_null(m_block->reg[i])) - { - if (m_bbs.at(addr).reg_origin_abs[i] < 0x40000) - { - regs[i] = c; - } - } + m_block->reg[i] = m_ir->CreateLoad(init_reg_fixed(i)); } } } + } - return result; + // Emit return from the real function + void ret_function() + { + m_ir->CreateRetVoid(); } void set_function(llvm::Function* func) @@ -2378,15 +3083,56 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (m_blocks.empty()) { // Special case: first block, proceed normally + if (auto fn = std::exchange(m_finfo->fn, nullptr)) + { + // Create a gateway + call_function(fn, true); + + m_finfo->fn = fn; + m_function = fn; + m_thread = &*fn->arg_begin(); + m_lsptr = &*(fn->arg_begin() + 1); + m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", fn)); + m_memptr = m_ir->CreateIntToPtr(m_ir->getInt64((u64)vm::g_base_addr), get_type()); + + // Load registers at the entry chunk + for (u32 i = 0; i < s_reg_max; i++) + { + if (i >= s_reg_80 && i <= s_reg_127) + { + // TODO + //m_finfo->load[i] = llvm::UndefValue::get(get_reg_type(i)); + } + + m_finfo->load[i] = m_ir->CreateLoad(init_reg_fixed(i)); + } + } } - else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target)) + else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target) && (!m_finfo->fn || !m_ret_info[target / 4])) { // Generate a tail call to the function chunk const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); - tail(add_function(target)); + const auto pfinfo = add_function(target); + + if (pfinfo->fn) + { + // Tail call to the real function + call_function(pfinfo->fn, true); + + if (!result->getTerminator()) + { + ret_function(); + } + } + else + { + // Just a boring tail call to another chunk + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); + tail(pfinfo->chunk); + } + m_ir->SetInsertPoint(cblock); return result; } @@ -2664,7 +3410,7 @@ class spu_llvm_recompiler : public 
spu_recompiler_base, public cpu_translator if (!reg) { // Load register value if necessary - reg = m_ir->CreateLoad(init_reg_fixed(index)); + reg = m_finfo && m_finfo->load[index] ? m_finfo->load[index] : m_ir->CreateLoad(init_reg_fixed(index)); } if (reg->getType() == get_type()) @@ -2688,44 +3434,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { if (const auto phi = llvm::dyn_cast(reg)) { - if (phi->getNumUses()) - { - LOG_WARNING(SPU, "[0x%x] $%u: Phi has uses :(", m_pos, index); - } - else - { - const auto cblock = m_ir->GetInsertBlock(); - m_ir->SetInsertPoint(phi); - - const auto newphi = m_ir->CreatePHI(get_type(), phi->getNumIncomingValues()); - - for (u32 i = 0; i < phi->getNumIncomingValues(); i++) - { - const auto iblock = phi->getIncomingBlock(i); - m_ir->SetInsertPoint(iblock->getTerminator()); - const auto ivalue = phi->getIncomingValue(i); - newphi->addIncoming(xfloat_to_double(ivalue), iblock); - } - - for (auto& b : m_blocks) - { - if (b.second.phi[index] == phi) - { - b.second.phi[index] = newphi; - } - - if (b.second.reg[index] == phi) - { - b.second.reg[index] = newphi; - } - } - - reg = newphi; - - m_ir->SetInsertPoint(cblock); - phi->eraseFromParent(); - return reg; - } } if (auto c = llvm::dyn_cast(reg)) @@ -2895,6 +3603,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator _store->eraseFromParent(); } + if (m_finfo && m_finfo->fn) + { + if (index == s_reg_lr || (index >= s_reg_80 && index <= s_reg_127)) + { + // Don't save some registers in true functions + return; + } + } + // Write register to the context _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, addr->getType()->getPointerElementType()), addr); } @@ -3297,7 +4014,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Call the entry function chunk const auto entry_chunk = add_function(m_pos); - m_ir->CreateCall(entry_chunk, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); + m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(label_stop); @@ -3317,14 +4034,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } // Create function table (uninitialized) - m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); + m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); // Create function chunks for (std::size_t fi = 0; fi < m_function_queue.size(); fi++) { // Initialize function info m_entry = m_function_queue[fi]; - set_function(m_functions[m_entry].func); + set_function(m_functions[m_entry].chunk); m_finfo = &m_functions[m_entry]; m_ir->CreateBr(add_block(m_entry)); @@ -3337,18 +4054,21 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(m_block->block); auto& bb = m_bbs.at(baddr); bool need_check = false; + m_block->bb = &bb; if (bb.preds.size()) { // Initialize registers and build PHI nodes if necessary for (u32 i = 0; i < s_reg_max; i++) { - const u32 src = bb.reg_origin[i]; + const u32 src = m_finfo->fn ? 
bb.reg_origin_abs[i] : bb.reg_origin[i]; - if (src == -1) + if (src > 0x40000) { - // TODO: type - const auto _phi = m_ir->CreatePHI(get_reg_type(i), ::size32(bb.preds)); + // Use the xfloat hint to create 256-bit (4x double) PHI + llvm::Type* type = g_cfg.core.spu_accurate_xfloat && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); + + const auto _phi = m_ir->CreatePHI(type, ::size32(bb.preds)); m_block->phi[i] = _phi; m_block->reg[i] = _phi; @@ -3369,13 +4089,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!value) { // Value hasn't been loaded yet - value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr); + value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(regptr); } - if (value->getType() == get_type()) + if (value->getType() == get_type() && type != get_type()) { value = double_to_xfloat(value); } + else if (value->getType() != get_type() && type == get_type()) + { + value = xfloat_to_double(m_ir->CreateBitCast(value, get_type())); + } else if (i < 128 && llvm::isa(value)) { // Bitcast the constant @@ -3402,7 +4126,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto regptr = init_reg_fixed(i); const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); - const auto value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr); + const auto value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(regptr); m_ir->SetInsertPoint(cblock); _phi->addIncoming(value, &m_function->getEntryBlock()); } @@ -3421,10 +4145,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator LOG_ERROR(SPU, "[0x%05x] Value not found ($%u from 0x%05x)", baddr, i, src); } } - else if (baddr == m_entry) + else { - // Passthrough constant from a different chunk (will be removed in future) - m_block->reg[i] = m_finfo->reg[i]; + m_block->reg[i] = m_finfo->load[i]; } } @@ -3523,29 +4246,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (found == m_functions.end()) { - if (m_entry_info[i / 4]) - { - LOG_ERROR(SPU, "[0x%x] Function chunk not compiled: 0x%x", func[0], i); - } - chunks.push_back(null); continue; } - chunks.push_back(found->second.func); - - // If a chunk has incoming constants, we can't add it to the function table (TODO) - for (const auto c : found->second.reg) - { - if (c != nullptr) - { - chunks.back() = null; - break; - } - } + chunks.push_back(found->second.chunk); } - m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), chunks)); + m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), chunks)); } else { @@ -3566,7 +4274,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator for (const auto& func : m_functions) { - const auto f = func.second.func; + const auto f = func.second.fn ? 
func.second.fn : func.second.chunk; pm.run(*f); for (auto& bb : *f) @@ -3581,31 +4289,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator li->eraseFromParent(); break; } - - // Replace volatile fake store with return - if (auto si = dyn_cast(&i); si && si->getOperand(1) == m_fake_global1) - { - const auto br = bb.getTerminator(); - - for (auto& j : *br->getSuccessor(0)) - { - // Cleanup PHI nodes if exist - if (auto phi = dyn_cast(&j)) - { - phi->removeIncomingValue(&bb, false); - } - else - { - break; - } - } - - m_ir->SetInsertPoint(bb.getTerminator()); - m_ir->CreateRetVoid(); - si->eraseFromParent(); - br->eraseFromParent(); - break; - } } } } @@ -3615,7 +4298,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_block_queue.clear(); m_functions.clear(); m_function_queue.clear(); - m_scan_queue.clear(); m_function_table = nullptr; std::string log; @@ -6443,6 +7125,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void STQD(spu_opcode_t op) { + if (m_finfo && m_finfo->fn) + { + if (op.rt == s_reg_lr || (op.rt >= s_reg_80 && op.rt <= s_reg_127)) + { + if (m_block->bb->reg_save_dom[op.rt] && get_reg_raw(op.rt) == m_finfo->load[op.rt]) + { + return; + } + } + } + value_t addr = eval(zext((extract(get_vr(op.ra), 3) + (get_imm(op.si10) << 4)) & 0x3fff0)); make_store_ls(addr, get_vr(op.rt)); } @@ -6578,7 +7271,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } // Convert an indirect branch into a static one if possible - if (const auto _int = llvm::dyn_cast(addr.value)) + if (const auto _int = llvm::dyn_cast(addr.value); _int && op.opcode) { const u32 target = ::narrow(_int->getZExtValue(), HERE); @@ -6601,17 +7294,34 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Fixed branch excludes the possibility it's a function return (TODO) ret = false; } - else if (llvm::isa(addr.value)) + else if (llvm::isa(addr.value) && op.opcode) { LOG_ERROR(SPU, "[0x%x] Unexpected constant (add_block_indirect)", m_pos); } + if (m_finfo && m_finfo->fn && op.opcode) + { + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + ret_function(); + m_ir->SetInsertPoint(cblock); + return result; + } + // Load stack addr if necessary value_t sp; if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) { - sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); + if (op.opcode) + { + sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); + } + else + { + sp.value = m_ir->CreateLoad(spu_ptr(&spu_thread::gpr, 1, &v128::_u32, 3)); + } } const auto cblock = m_ir->GetInsertBlock(); @@ -6920,6 +7630,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRASL(spu_opcode_t op) // { set_link(op); + + const u32 target = spu_branch_target(0, op.i16); + + if (m_finfo && m_finfo->fn && target != m_pos + 4) + { + if (auto fn = add_function(target)->fn) + { + call_function(fn); + return; + } + else + { + LOG_FATAL(SPU, "[0x%x] Can't add function 0x%x", m_pos, target); + return; + } + } + BRA(op); } @@ -6946,6 +7673,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRSL(spu_opcode_t op) // { set_link(op); + + const u32 target = spu_branch_target(m_pos, op.i16); + + if (m_finfo && m_finfo->fn && target != m_pos + 4) + { + if (auto fn = add_function(target)->fn) + { + call_function(fn); + return; + } + else 
+ { + LOG_FATAL(SPU, "[0x%x] Can't add function 0x%x", m_pos, target); + return; + } + } + BR(op); } @@ -6961,13 +7705,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, build(0, 0, 0, spu_branch_target(m_pos + 4))); + if (m_finfo && m_finfo->fn) + { + return; + } + if (g_cfg.core.spu_block_size != spu_block_size_type::safe && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1]) { // Store the return function chunk address at the stack mirror - const auto func = add_function(m_pos + 4); + const auto pfunc = add_function(m_pos + 4); const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror)); const auto stack1 = eval(stack0 + 8); - m_ir->CreateStore(func, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), func->getType()->getPointerTo())); + m_ir->CreateStore(pfunc->chunk, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), pfunc->chunk->getType()->getPointerTo())); m_ir->CreateStore(m_ir->getInt64(m_pos + 4), m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack1.value), get_type())); } } diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index af5ad3c70f66..b02369b2cd05 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -199,6 +199,17 @@ class spu_recompiler_base s_reg_max }; + // Classify terminator instructions + enum class term_type : unsigned char + { + br, + ret, + call, + fallthrough, + indirect_call, + interrupt_call, + }; + protected: std::shared_ptr m_spurt; @@ -239,12 +250,39 @@ class spu_recompiler_base // Internal use flag bool analysed = false; + // Terminator instruction type + term_type terminator; + // Bit mask of the registers modified in the block std::bitset reg_mod{}; + // Set if last modifying instruction produces xfloat + std::bitset reg_mod_xf{}; + + // Set if the initial register value in this block may be xfloat + std::bitset reg_maybe_xf{}; + // Bit mask of the registers used (before modified) std::bitset reg_use{}; + // Bit mask of the trivial (u32 x 4) constant value resulting in this block + std::bitset reg_const{}; + + // Bit mask of register saved onto the stack before use + std::bitset reg_save_dom{}; + + // Address of the function + u32 func = 0x40000; + + // Value subtracted from $SP in this block, negative if something funny is done on $SP + u32 stack_sub = 0; + + // Constant values associated with reg_const + std::array reg_val32; + + // Registers loaded from the stack in this block (stack offset) + std::array reg_load_mod{}; + // Single source of the reg value (dominating block address within the same chunk) or a negative number std::array reg_origin, reg_origin_abs; @@ -258,13 +296,27 @@ class spu_recompiler_base // Sorted basic block info std::map m_bbs; - // Advanced block (chunk) information - struct chunk_info + // Sorted advanced block (chunk) list + std::basic_string m_chunks; + + // Function information + struct func_info { + // Size to the end of last basic block + u16 size = 0; + + // Determines whether a function is eligible for optimizations + bool good = false; + + // Call targets + std::basic_string calls; + + // Register save info (stack offset) + std::array reg_save_off{}; }; - // Sorted chunk info - std::map m_chunks; + // Sorted function info + std::map m_funcs; std::shared_ptr m_cache;
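
The SPUAnalyser.h hunk above reorders `spu_itype` so that every instruction producing an xfloat value (FMA through FRDS) occupies one contiguous range; the tag-dispatch `operator&` then answers membership with two comparisons instead of a lookup table. A minimal standalone sketch of the idiom — the enum here is deliberately tiny and its values are illustrative, only the layout (one contiguous xfloat range) matters:

```cpp
#include <cassert>

// Sketch of spu_itype's tag-based range test
struct itype
{
	static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values

	enum type : unsigned char
	{
		UNK = 0,

		FMA,  // xfloat_tag first
		FM,
		FI,
		FRDS, // xfloat_tag last

		DFA,  // double-precision: deliberately outside the range
	};

	// Test for xfloat instruction: two comparisons, no table
	friend constexpr bool operator &(type value, xfloat_tag)
	{
		return value >= FMA && value <= FRDS;
	}
};

int main()
{
	static_assert(itype::FM & itype::xfloat, "FM produces xfloat");
	static_assert(!(itype::DFA & itype::xfloat), "DFA is double-precision");
	assert(itype::FI & itype::xfloat);
	return 0;
}
```

This is why the diff shuffles FREST/FRSQEST/FI/CSFLT/CUFLT/FRDS next to the quadrop group and pushes the DF* and conversion instructions below them: the classification is encoded in the enum order itself, matching the existing `branch_tag`/`floating_tag`/`quadrop_tag` tests.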
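
The analyser's first walk flags registers stored to the stack before any other use (`reg_save_dom`): a `STQD $rt, off($SP)` hitting an untouched register sets the flag, and any later read that is not the save instruction itself clears it. A sketch of just that bookkeeping, with the decoder stubbed out as two hypothetical callbacks (`on_stack_store`, `on_reg_read` are illustrative names, not RPCS3 API):

```cpp
#include <bitset>
#include <cstdint>

constexpr unsigned s_reg_max = 128;

// Per-block bookkeeping, mirroring the bitsets in block_info
struct block_info
{
	std::bitset<s_reg_max> reg_mod, reg_use, reg_save_dom;
};

// STQD rt, off($SP): a dominating save only if rt was neither
// modified nor read earlier in the block
void on_stack_store(block_info& bb, uint8_t rt)
{
	if (!bb.reg_mod[rt] && !bb.reg_use[rt])
		bb.reg_save_dom[rt] = true;
}

// Any instruction reading 'reg'; reg_save is the register spilled by
// this same instruction (255 if none), so the save's own source
// operand doesn't invalidate itself
void on_reg_read(block_info& bb, uint8_t reg, uint8_t reg_save = 255)
{
	if (!bb.reg_mod[reg])
	{
		bb.reg_use.set(reg);

		if (reg != reg_save && bb.reg_save_dom[reg])
			bb.reg_save_dom[reg] = false; // used after saving: not a pure spill
	}
}

int main()
{
	block_info bb;
	on_stack_store(bb, 80);  // STQD $80, -16($SP)
	on_reg_read(bb, 80, 80); // the store's own use of $80
	on_reg_read(bb, 80);     // a genuine later use
	return bb.reg_save_dom[80]; // 0: $80 is not merely saved
}
```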
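
The second walk folds immediates through each block (`IL`, `ILHU`, `IOHL`, `ORI`, `AI`, ...) so later passes can recognise constant stack adjustments. A sketch reduced to a single u32 lane — the real pass tracks the preferred slot of a v128 the same way, and the method names below are illustrative shorthands for the `switch (last_inst)` cases:

```cpp
#include <array>
#include <bitset>
#include <cstdint>
#include <cstdio>

// Per-block constant folding: 'known' mirrors reg_const, 'val' mirrors reg_val32
struct block_consts
{
	std::bitset<128> known;
	std::array<uint32_t, 128> val{};

	void il(uint8_t rt, int16_t si16) // IL: sign-extended immediate
	{
		known[rt] = true;
		val[rt] = (int32_t)si16;
	}

	void ilhu(uint8_t rt, uint16_t i16) // ILHU: immediate into the high half
	{
		known[rt] = true;
		val[rt] = uint32_t(i16) << 16;
	}

	void iohl(uint8_t rt, uint16_t i16) // IOHL: OR into the low half
	{
		val[rt] |= i16; // 'known' is left as-is
	}

	void ai(uint8_t rt, uint8_t ra, int16_t si10) // AI: add immediate
	{
		known[rt] = known[ra];
		val[rt] = val[ra] + si10;
	}
};

int main()
{
	block_consts bb;
	bb.ilhu(3, 0x0004); // $3 = 0x00040000
	bb.iohl(3, 0x1230); // $3 = 0x00041230
	bb.ai(4, 3, 0x10);  // $4 = $3 + 16
	std::printf("$4 = 0x%x (known=%d)\n", bb.val[4], (int)bb.known[4]);
	return 0;
}
```

The ILHU/IOHL pair is the classic SPU way to materialise a 32-bit constant, which is why IOHL only ORs into the existing value and leaves the `known` bit alone.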
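
With constants available, the pass models $SP symbolically: `stack_sub` accumulates the bytes subtracted from $SP within a block, and anything it cannot prove (a non-constant adjustment, a misaligned or oversized frame) poisons the value to the 0x80000000 sentinel. A sketch covering only the `AI $SP, $SP, imm` case; the real switch also handles `A` and `SF` with constant operands:

```cpp
#include <cstdint>
#include <cstdio>

// 0x80000000 is the analyser's sentinel for an unpredictable stack
constexpr uint32_t poison = 0x80000000;

struct block
{
	uint32_t stack_sub = 0; // bytes subtracted from $SP so far
};

// AI $SP, $SP, si10 (the common prologue/epilogue form)
void on_ai_sp(block& bb, int32_t si10)
{
	if (bb.stack_sub == poison)
		return; // already unpredictable

	bb.stack_sub -= si10; // prologues use a negative immediate

	// Same sanity checks as the real pass: frames must be 16-byte
	// aligned and fit in the 256 KiB local store
	if (bb.stack_sub >= 0x40000 || bb.stack_sub % 16)
		bb.stack_sub = poison;
}

int main()
{
	block bb;
	on_ai_sp(bb, -80); // AI $SP, $SP, -80
	std::printf("frame: 0x%x\n", bb.stack_sub); // frame: 0x50
	on_ai_sp(bb, +80); // matching epilogue restores $SP
	std::printf("frame: 0x%x\n", bb.stack_sub); // frame: 0x0
	return 0;
}
```

Successor blocks then inherit `stack_sub` from the assumed prologue via `reg_origin_abs[s_reg_sp]`, which is what lets the later function checks demand a zero frame at every `ret` terminator.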
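
Each block's final instruction is classified into the new `term_type` added to SPURecompiler.h, which later drives code generation (returns, direct calls, chunk tail calls). A hypothetical classifier covering only the immediate BR/BRSL family; the real switch also handles the indirect BI/BISL/BISLED forms and falls back to the conservative `interrupt_call` bucket when in doubt:

```cpp
#include <cstdint>

// term_type as declared in SPURecompiler.h
enum class term_type : unsigned char
{
	br,
	ret,
	call,
	fallthrough,
	indirect_call,
	interrupt_call,
};

// tia: address of the terminator; target: branch target;
// func: owning function entry; links_lr: the instruction writes $LR
term_type classify_br(uint32_t tia, uint32_t target, uint32_t func, bool is_brsl, bool links_lr)
{
	if (target == tia + 4)
		return term_type::fallthrough; // branch to the next instruction is a no-op

	if (!is_brsl)
	{
		// A plain branch back to the entry is a recursive tail call,
		// which the pass treats as a return
		return target == func ? term_type::ret : term_type::br;
	}

	// BRSL/BRASL: a call when it links through $LR, otherwise the
	// conservative interrupt_call bucket
	return links_lr ? term_type::call : term_type::interrupt_call;
}

int main()
{
	return classify_br(0x1000, 0x2000, 0x800, true, true) == term_type::call ? 0 : 1;
}
```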
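
Finally, a function is only compiled as a real callable (`good`) if everything it can reach is also good, so the diff iterates the call-graph check until a fixpoint: demoting one function may invalidate its callers, which must be re-examined. The same demotion loop in isolation, with `func_info` pared down to the two fields involved:

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

// Pared-down func_info: just eligibility and call targets
struct func_info
{
	bool good = true;
	std::vector<uint32_t> calls;
};

// Demote every function that calls an unknown or bad function,
// repeating until nothing changes
void check_call_graph(std::map<uint32_t, func_info>& funcs)
{
	bool need_repeat = true;

	while (need_repeat)
	{
		need_repeat = false;

		for (auto& f : funcs)
		{
			if (!f.second.good)
				continue;

			for (uint32_t call : f.second.calls)
			{
				const auto found = funcs.find(call);

				if (found == funcs.end() || !found->second.good)
				{
					f.second.good = false; // callers will be re-checked next pass
					need_repeat = true;
					break;
				}
			}
		}
	}
}

int main()
{
	std::map<uint32_t, func_info> funcs;
	funcs[0x100].calls = {0x200};
	funcs[0x200].calls = {0x300}; // 0x300 was never analysed
	check_call_graph(funcs);
	std::printf("0x100 good: %d\n", (int)funcs.at(0x100).good); // prints 0
	return 0;
}
```

The loop terminates because each pass can only flip `good` from true to false, so at most one pass per function is wasted; badness propagates transitively up the graph, exactly as in the `while (g_cfg.core.spu_block_size == spu_block_size_type::giga)` loop above.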