From 000a5052367a97a61c25c04b42a2d1011360579b Mon Sep 17 00:00:00 2001
From: Eladash <18193363+elad335@users.noreply.github.com>
Date: Wed, 6 Mar 2024 17:28:07 +0200
Subject: [PATCH] SPU LLVM: PUTLLC 16 Optimization

---
 rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 2096 ++++++++++++++++++++++--
 rpcs3/Emu/Cell/SPULLVMRecompiler.cpp   |  182 ++
 rpcs3/Emu/Cell/SPURecompiler.h         |   25 +
 rpcs3/Emu/Cell/SPUThread.h             |    1 +
 rpcs3/Emu/System.cpp                   |   57 +-
 rpcs3/Emu/System.h                     |    2 +-
 rpcs3/headless_application.cpp         |    2 +-
 rpcs3/rpcs3qt/gui_application.cpp      |   42 +-
 8 files changed, 2267 insertions(+), 140 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
index 1acb81ff3963..840849763ebc 100644
--- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
@@ -29,6 +29,8 @@
 const extern spu_decoder<spu_itype> g_spu_itype;
 const extern spu_decoder<spu_iname> g_spu_iname;
 const extern spu_decoder<spu_iflag> g_spu_iflag;
 
+constexpr u32 s_reg_max = spu_recompiler_base::s_reg_max;
+
 // Move 4 args for calling native function from a GHC calling convention function
 #if defined(ARCH_X64)
 static u8* move_args_ghc_to_native(u8* raw)
@@ -2472,6 +2474,262 @@ std::vector<u32> spu_thread::discover_functions(u32 base_addr, std::span
+struct reg_state_t
+{
+	bs_t<vf> flag{+vf::is_null};
+	u32 value{};
+	u32 tag = umax;
+	u32 known_ones{};
+	u32 known_zeroes{};
+
+	bool is_const() const
+	{
+		return !!(flag & vf::is_const);
+	}
+
+	bool operator&(vf to_test) const
+	{
+		return this->flag.all_of(to_test);
+	}
+
+	bool is_less_than(u32 imm) const
+	{
+		if (flag & vf::is_const && value < imm)
+		{
+			return true;
+		}
+
+		if (flag & vf::is_mask && ~known_zeroes < imm)
+		{
+			return true;
+		}
+
+		return false;
+	}
+
+	bool operator==(const reg_state_t& r) const
+	{
+		if (flag != r.flag)
+		{
+			return false;
+		}
+
+		return (flag & vf::is_const ?
value == r.value : (tag == r.tag && known_ones == r.known_ones && known_zeroes == r.known_zeroes)); + } + + bool operator!=(const reg_state_t& r) const + { + return !(*this == r); + } + + // Compare equality but try to ignore changes in unmasked bits + bool compare_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const + { + return *this == r || + (tag == r.tag && flag == r.flag && flag & vf::is_mask && !!(known_ones & ~r.known_ones & mask_bits) && !!(known_zeroes & ~r.known_zeroes & mask_bits)) || + (flag == r.flag && flag & vf::is_const && ((value ^ r.value) & mask_bits) == 0); + } + + reg_state_t merge(reg_state_t rhs) const + { + if (rhs == *this) + { + // Perfect state: no conflicts + return rhs; + } + + if ((rhs.flag + flag).all_of(vf::is_const + vf::is_mask)) + { + // Try to downgrade to a known-bits type value + reg_state_t _rhs = rhs; + + if (_rhs.flag & vf::is_const) + { + _rhs = reg_state_t{vf::is_mask, 0, umax, _rhs.value, ~_rhs.value}; + } + + reg_state_t _this = *this; + + if (_this.flag & vf::is_const) + { + _this = reg_state_t{vf::is_mask, 0, umax, _this.value, ~_this.value}; + } + + if ((_rhs.flag & _this.flag) & vf::is_mask) + { + // Now it is possible to merge the two values + const reg_state_t res{vf::is_mask, 0, umax, _rhs.known_ones & _this.known_ones, _rhs.known_zeroes & _this.known_zeroes}; + + if (res.known_zeroes | res.known_ones) + { + // Success + return res; + } + } + } + + return make_unknown(); + } + + reg_state_t build_on_top_of(reg_state_t rhs) const + { + if (rhs == *this) + { + // Perfect state: no conflicts + return rhs; + } + + if (flag & vf::is_null) + { + // Value unmodified + return rhs; + } + + return *this; + } + + template + static std::conditional_t> make_unknown() noexcept + { + if constexpr (Count == 1) + { + reg_state_t v{}; + v.tag = alloc_tag(); + v.flag = {}; + return v; + } + else + { + std::array result{}; + + for (reg_state_t& state : result) + { + state = make_unknown<1>(); + } + + return result; + } + } + + static reg_state_t from_value(u32 value) noexcept + { + reg_state_t v{}; + v.value = value; + v.flag = vf::is_const; + return v; + } + + static u32 alloc_tag(bool reset = false) noexcept + { + static thread_local u32 g_tls_tag = 0; + + if (reset) + { + g_tls_tag = 0; + } + + return ++g_tls_tag; + } +}; + +// Converge 2 register states to the same flow in execution +template +static std::array merge(const std::array& lhs, const std::array& rhs) +{ + std::array result{}; + + usz index = umax; + + for (reg_state_t& state : result) + { + index++; + + state = lhs[index].merge(rhs[index]); + } + + return result; +} + +// Override RHS state with the newer LHS state +template +static std::array build_on_top_of(const std::array& lhs, const std::array& rhs) +{ + std::array result{}; + + usz index = umax; + + for (reg_state_t& state : result) + { + index++; + + state = lhs[index].build_on_top_of(rhs[index]); + } + + return result; +} + +struct block_reg_info +{ + u32 pc = SPU_LS_SIZE; // Address + std::array local_state{}; + bool has_true_state = false; + std::array true_start_state{}; + std::array end_state{}; + std::array addend_state{}; + std::array walkby_state{}; // State that is made by merging state_predecessor and iterating over instructions for final instrucion walk + + usz next_nodes_count = 0; + + struct node_t + { + u32 prev_pc = umax; + }; + + std::vector prev_nodes; + + static std::unique_ptr create(u32 pc) noexcept + { + auto ptr = new block_reg_info{ pc, reg_state_t::make_unknown() }; + + for (reg_state_t& f : 
ptr->local_state) + { + f.flag += vf::is_null; + } + + return std::unique_ptr(ptr); + } + + // Evaluate registers state + std::array& evaluate_start_state(const std::map>& map); + + // This function creates new node if not found and links the proceeding node to the old node + // In a manner in which no duplicate paths are formed + static void create_node(u32 pc_rhs, u32 parent_pc, std::map>& map) + { + //ensure(parent_node != pc_rhs); + ensure(map[parent_pc]); + + if (!map[pc_rhs]) + { + map[pc_rhs] = create(pc_rhs); + } + + node_t prev_node{parent_pc}; + map[parent_pc]->next_nodes_count++; + map[pc_rhs]->prev_nodes.emplace_back(prev_node); + } +}; spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, std::map>* out_target_list) { @@ -2503,22 +2761,6 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s m_chunks.clear(); m_funcs.clear(); - // Value flags (TODO: only is_const is implemented) - enum class vf : u32 - { - is_const, - is_mask, - is_rel, - - __bitset_enum_max - }; - - // Weak constant propagation context (for guessing branch targets) - std::array, 128> vflags{}; - - // Associated constant values for 32-bit preferred slot - std::array values; - // SYNC instruction found bool sync = false; @@ -2533,6 +2775,12 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s { } + // Weak constant propagation context (for guessing branch targets) + std::array, 128> vflags{}; + + // Associated constant values for 32-bit preferred slot + std::array values; + for (u32 wi = 0, wa = workload[0]; wi < workload.size();) { const auto next_block = [&] @@ -2607,7 +2855,6 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s case spu_itype::DFCMGT: case spu_itype::DFTSV: { - // Stop before invalid instructions (TODO) next_block(); continue; } @@ -3343,6 +3590,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } limit = std::min(limit, lsa + ::size32(result.data) * 4); + result.inst_attrs.resize(result.data.size()); // Cleanup block info for (u32 i = 0; i < workload.size(); i++) @@ -3444,115 +3692,1445 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } - // Fill block info - for (auto& pred : m_preds) + struct putllc16_statistics_t { - auto& block = m_bbs[pred.first]; - - // Copy predeccessors (wrong at this point, needs a fixup later) - block.preds = pred.second; + atomic_t all = 0; + atomic_t single = 0; + atomic_t nowrite = 0; + std::array, 128> breaking_reason{}; - // Fill register usage info - for (u32 ia = pred.first; ia < limit; ia += 4) + std::vector> get_reasons() { - block.size++; - - // Decode instruction - const spu_opcode_t op{std::bit_cast>(result.data[(ia - lsa) / 4])}; + std::vector> map; + for (usz i = 0; i < breaking_reason.size(); i++) + { + if (u64 v = breaking_reason[i]) + { + map.emplace_back(i, v); + } + } - const auto type = g_spu_itype.decode(op.opcode); + std::stable_sort(map.begin(), map.end(), FN(x.second > y.second)); + return map; + } + }; - u8 reg_save = 255; + struct atomic16_t + { + bool active; // GETLLAR happened + u32 lsa_pc; // PC of first LSA write + u32 put_pc; // PC of PUTLLC + bool lsa_write; // LSA written + reg_state_t ls; // state of LS load/store register + reg_state_t lsa; // state of LSA register on GETLLAR + bool ls_pc_rel; // For STQR/LQR + bool ls_access; // LS accessed + bool ls_write; // LS written + bool ls_invalid; // From this point and on, any store will cancel the optimization + bool 
select_16_or_0_at_runtime; + u32 ls_offs; // LS offset from register (0 if const) + u32 reg; // Source of address register of LS load/store + bool put_active; // PUTLLC happened + bool get_rdatomic = false; // True if MFC_RdAtomicStat was read after GETLLAR - if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt]) + // Return old state for error reporting + atomic16_t discard() + { + if (!active) { - // Register saved onto the stack before use - block.reg_save_dom[op.rt] = true; - - reg_save = op.rt; + return atomic16_t{}; } - for (auto* _use : {&m_use_ra, &m_use_rb, &m_use_rc}) - { - if (u8 reg = (*_use)[ia / 4]; reg < s_reg_max) - { - // Register reg use only if it happens before reg mod - if (!block.reg_mod[reg]) - { - block.reg_use.set(reg); + const u32 pc = lsa_pc; + const bool write = lsa_write; - if (reg_save != reg && block.reg_save_dom[reg]) - { - // Register is still used after saving; probably not eligible for optimization - block.reg_save_dom[reg] = false; - } - } - } - } + const atomic16_t old = *this; + *this = atomic16_t{}; - if (m_use_rb[ia / 4] == s_reg_mfc_eal) + // Keep some members + lsa_pc = pc; + lsa_write = write; + return old; + } + + // Conditional breakage (break if a full 128-byte reservation is needed) + atomic16_t set_invalid_ls(bool write) + { + if (!write) { - // Expand MFC_Cmd reg use - for (u8 reg : {s_reg_mfc_lsa, s_reg_mfc_tag, s_reg_mfc_size}) + ls_invalid = true; + + if (ls_write) { - if (!block.reg_mod[reg]) - block.reg_use.set(reg); + return discard(); } } - - // Register reg modification - if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max) + else { - block.reg_mod.set(reg); - block.reg_mod_xf.set(reg, type & spu_itype::xfloat); + return discard(); + } - if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb])) - block.reg_mod_xf.set(reg); + return atomic16_t{}; + } + }; - // Possible post-dominating register load - if (type == spu_itype::LQD && op.ra == s_reg_sp) - block.reg_load_mod[reg] = ia + 1; - else - block.reg_load_mod[reg] = 0; - } + // Reset tags + reg_state_t::alloc_tag(true); - // Find targets (also means end of the block) - const auto tfound = m_targets.find(ia); + std::map> infos; + infos.emplace(entry_point, block_reg_info::create(entry_point)); - if (tfound != m_targets.end()) - { - // Copy targets - block.targets = tfound->second; + struct block_reg_state_iterator + { + u32 pc{}; + usz parent_iterator_index = umax; - // Assume that the call reads and modifies all volatile registers (TODO) - bool is_call = false; - bool is_tail = false; - switch (type) - { - case spu_itype::BRSL: - is_call = spu_branch_target(ia, op.i16) != ia + 4; - break; - case spu_itype::BRASL: - is_call = spu_branch_target(0, op.i16) != ia + 4; - break; - case spu_itype::BISL: - case spu_itype::BISLED: - is_call = true; - break; - default: - break; - } + // PUTLLC16 optimization analysis tracker + atomic16_t atomic16{}; - if (is_call) - { - for (u32 i = 0; i < s_reg_max; ++i) - { - if (i == s_reg_lr || (i >= 2 && i < s_reg_80) || i > s_reg_127) - { - if (!block.reg_mod[i]) - block.reg_use.set(i); + block_reg_state_iterator(u32 _pc, usz _parent_iterator_index = umax) noexcept + : pc(_pc) + , parent_iterator_index(_parent_iterator_index) + { + } + }; - if (!is_tail) + std::vector> reg_state_it; + + std::map atomic16_all; // RdAtomicStat location -> atomic loop optimization state + std::map getllar_starts; // True for failed loops + std::map run_on_block; + + std::array* true_state_walkby = nullptr; 
+ + atomic16_t dummy16{}; + + bool likely_putllc_loop = false; + + for (u32 i = 0, count = 0; i < result.data.size(); i++) + { + const u32 inst = std::bit_cast>(result.data[i]); + + if (spu_opcode_t{inst}.ra == MFC_RdAtomicStat && g_spu_itype.decode(inst) == spu_itype::RDCH) + { + count++; + + if (count == 2) + { + likely_putllc_loop = true; + break; + } + } + } + + usz target_count = 0; + + for (auto& [pc, loc] : m_targets) + { + target_count += loc.size(); + } + + const bool should_search_patters = likely_putllc_loop && target_count < 100; + + if (should_search_patters) + { + // Initialize + reg_state_it.emplace_back(std::make_unique(entry_point)); + run_on_block[entry_point / 4] = true; + } + + for (u32 wf = should_search_patters ? 0 : u32{umax}, wi = 0, wa = entry_point, bpc = wa; wf <= 1;) + { + const bool is_form_block = wf == 0; + const bool is_pattern_match = wf == 1; + + dummy16.active = false; + + if (!is_form_block && wa == bpc) + { + true_state_walkby = &infos[bpc]->evaluate_start_state(infos); + } + + auto& vregs = is_form_block ? infos[bpc]->local_state : *true_state_walkby; + auto& atomic16 = is_pattern_match ? ::at32(reg_state_it, wi)->atomic16 : dummy16; + + if (wa == bpc) + { + for (reg_state_t& f : vregs) + { + if (f.flag & vf::is_null) + { + // Evaluate locally + f.tag = reg_state_t::alloc_tag(); + } + } + } + + if (is_form_block && wa == bpc) + { + for (reg_state_t& f : vregs) + { + f.flag += vf::is_null; + } + } + + const u32 pos = wa; + + wa += 4; + + const auto break_putllc16 = [&](u32 cause, atomic16_t previous) + { + if (previous.active && likely_putllc_loop && getllar_starts.contains(previous.lsa_pc)) + { + const bool is_first = !std::exchange(getllar_starts[previous.lsa_pc], true); + + if (!is_first) + { + return; + } + + g_fxo->get().breaking_reason[cause]++; + + spu_log.notice("PUTLLC pattern breakage [%x, cause=%u]", wa - 4, cause); + + const auto values = g_fxo->get().get_reasons(); + + std::string tracing = "Most common breaking reasons:"; + + usz i = 0; + for (auto it = values.begin(); it != values.end() && i < 4; i++, it++) + { + fmt::append(tracing, " [cause=%d, n=%d]", it->first, it->second); + } + + fmt::append(tracing, " of %d failures", g_fxo->get().all - g_fxo->get().single); + spu_log.notice("%s", tracing); + } + }; + + const auto next_block = [&]() + { + // Reset value information + wi++; + + if (wf == 0) + { + auto& block = infos[bpc]; + block->addend_state = block->local_state; + } + + if (wi >= reg_state_it.size()) + { + wf++; + wi = 0; + + if (wf == 1) + { + reg_state_it.clear(); + + if (!infos.empty()) + { + auto& start_node = *infos.begin()->second; + start_node.has_true_state = true; + start_node.true_start_state = {}; + start_node.end_state = start_node.addend_state; + + for (reg_state_t& f : start_node.end_state) + { + f.flag -= vf::is_null; + } + + reg_state_it.emplace_back(std::make_unique(infos.begin()->second->pc)); + } + } + } + + if (wi < reg_state_it.size()) + { + wa = ::at32(reg_state_it, wi)->pc; + bpc = wa; + } + }; + + const auto get_reg = [&](u32 reg) -> const reg_state_t& + { + return vregs[reg]; + }; + + const auto move_reg = [&](u32 dst, u32 src) + { + if (dst == src) + { + return; + } + + vregs[dst] = vregs[src]; + vregs[dst].flag -= vf::is_null; + }; + + const auto set_const_value = [&](u32 reg, u32 value) + { + vregs[reg] = reg_state_t::from_value(value); + }; + + const auto inherit_const_value = [&](u32 reg, bs_t flag, u32 value) + { + flag -= vf::is_null; + vregs[reg] = reg_state_t{flag, value, flag & vf::is_const ? 
u32{umax} : reg_state_t::alloc_tag()}; + }; + + const auto inherit_const_mask_value = [&](u32 reg, reg_state_t state, u32 mask_ones, u32 mask_zeroes) + { + if ((mask_ones | mask_zeroes) == 0) + { + state.flag -= vf::is_null; + vregs[reg] = state; + return; + } + + if (state.flag & vf::is_const) + { + vregs[reg] = reg_state_t::from_value((state.value | mask_ones) & ~mask_zeroes); + return; + } + + if (vf::is_mask - state.flag) + { + state.known_ones = 0; + state.known_zeroes = 0; + } + + const u32 ones = (state.known_ones | mask_ones) & ~mask_zeroes; + const u32 zeroes = (state.known_zeroes | mask_zeroes) & ~mask_ones; + + if ((ones ^ zeroes) == umax) + { + // Special case: create a constant from full masks + vregs[reg] = reg_state_t::from_value(ones); + return; + } + + vregs[reg] = reg_state_t{vf::is_mask, 0, state.tag, ones, zeroes}; + }; + + const auto unconst = [&](u32 reg) + { + vregs[reg] = {{}, {}, reg_state_t::alloc_tag()}; + }; + + const auto add_block = [&](u32 target) + { + // Validate new target (TODO) + if (target >= lsa && (target & -4) < limit) + { + if (!infos[target]) + { + infos[target] = block_reg_info::create(target); + } + + if (is_form_block) + { + block_reg_info::create_node(target, bpc, infos); + + if (!run_on_block[target / 4]) + { + reg_state_it.emplace_back(std::make_unique(target)); + run_on_block[target / 4] = true; + } + + return; + } + + // Check block duplication (terminating infinite loops) + // Even if duplicated, this still has impact by registering the end of the possible code path outcome + std::set positions; + + for (usz i = wi;;) + { + auto& entry = ::at32(reg_state_it, i); + AUDIT(positions.emplace(entry->pc).second); + + if (entry->pc == target) + { + return; + } + + const usz parent = entry->parent_iterator_index; + + if (parent == umax) + { + break; + } + + ensure(i != parent); + i = parent; + } + + auto& next = reg_state_it.emplace_back(std::make_unique(target, wi)); + next->atomic16 = ::at32(reg_state_it, wi)->atomic16; + } + }; + + if (pos < lsa || pos >= limit) + { + // Don't analyse if already beyond the limit + next_block(); + continue; + } + + const u32 data = std::bit_cast>(::at32(result.data, (pos - lsa) / 4)); + const auto op = spu_opcode_t{data}; + const auto type = g_spu_itype.decode(data); + + // For debugging + if (false && likely_putllc_loop && is_pattern_match) + { + SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast(ls) + pos, pos); + dis_asm.disasm(pos); + + std::string consts; + + for (u8 reg_file : {m_use_ra[pos / 4], m_use_rb[pos / 4], m_use_rc[pos / 4]}) + { + if (reg_file == umax) + { + continue; + } + + auto reg = get_reg(reg_file); + if (reg.is_const()) + { + if (!consts.empty()) + { + consts += ','; + } + + fmt::append(consts, " r%d=0x%x", reg_file, reg.value); + } + } + + if (!consts.empty()) + { + consts = " {" + consts + " }"; + } + + if (dis_asm.last_opcode.ends_with('\n')) + { + dis_asm.last_opcode.pop_back(); + } + + spu_log.always()("[SPU=0%x, wi=%d] %s%s", pos, wi, dis_asm.last_opcode, consts); + } + + // Analyse instruction + switch (type) + { + case spu_itype::UNK: + case spu_itype::DFCEQ: + case spu_itype::DFCMEQ: + case spu_itype::DFCGT: + case spu_itype::DFCMGT: + case spu_itype::DFTSV: + { + // Stop before invalid instructions (TODO) + next_block(); + continue; + } + + case spu_itype::SYNC: + case spu_itype::STOP: + case spu_itype::STOPD: + { + if (data == 0) + { + // Stop before null data + next_block(); + continue; + } + + if (g_cfg.core.spu_block_size == spu_block_size_type::safe) + { + // Stop on 
special instructions (TODO) + next_block(); + break; + } + + if (type == spu_itype::SYNC) + { + // Remember + sync = true; + } + + break; + } + + case spu_itype::IRET: + { + next_block(); + break; + } + + case spu_itype::BI: + case spu_itype::BISL: + case spu_itype::BISLED: + case spu_itype::BIZ: + case spu_itype::BINZ: + case spu_itype::BIHZ: + case spu_itype::BIHNZ: + { + break; + } + + case spu_itype::BRSL: + case spu_itype::BRASL: + { + break; + } + + case spu_itype::BRA: + { + break; + } + + case spu_itype::BR: + case spu_itype::BRZ: + case spu_itype::BRNZ: + case spu_itype::BRHZ: + case spu_itype::BRHNZ: + { + break; + } + + case spu_itype::DSYNC: + case spu_itype::HEQ: + case spu_itype::HEQI: + case spu_itype::HGT: + case spu_itype::HGTI: + case spu_itype::HLGT: + case spu_itype::HLGTI: + case spu_itype::LNOP: + case spu_itype::NOP: + case spu_itype::MTSPR: + case spu_itype::FSCRWR: + { + // Do nothing + break; + } + + case spu_itype::WRCH: + { + switch (op.ra) + { + case MFC_EAL: + { + move_reg(s_reg_mfc_eal, op.rt); + break; + } + case MFC_LSA: + { + auto rt = get_reg(op.rt); + inherit_const_mask_value(s_reg_mfc_lsa, rt, 0, ~0x3ffff); + + if (!atomic16.active) + { + atomic16.discard(); + atomic16.lsa_write = true; + atomic16.lsa_pc = pos; + } + + break; + } + case MFC_TagID: + { + break; + } + case MFC_Size: + { + break; + } + case MFC_Cmd: + { + const auto [af, av, atagg, _3, _5] = get_reg(op.rt); + + if (!is_pattern_match) + { + // + } + else if (af & vf::is_const) + { + switch (av) + { + case MFC_GETLLAR_CMD: + { + if (!atomic16.lsa_write) + { + atomic16.discard(); + atomic16.lsa_pc = bpc; // Start of the block: where we last had track of LSA value + atomic16.lsa_write = true; + } + + atomic16.active = true; + + auto lsa = get_reg(s_reg_mfc_lsa); + inherit_const_mask_value(s_reg_mfc_lsa, lsa, 0, ~0x3ff80); + atomic16.lsa = get_reg(s_reg_mfc_lsa); + + if (likely_putllc_loop) + { + // Register loop entry + if (getllar_starts.emplace(atomic16.lsa_pc, false).second) + { + g_fxo->get().all++; + spu_log.notice("[0x%05x] GETLLAR pattern entry point", pos); + } + } + + break; + } + case MFC_PUTLLC_CMD: + { + if (atomic16.active) + { + const auto _lsa = get_reg(s_reg_mfc_lsa); + + if (atomic16.ls_access && atomic16.ls_write && !atomic16.ls_pc_rel && !atomic16.ls.is_const()) + { + bool found = false; + for (u32 i = 0; i < s_reg_max; i++) + { + const auto& _reg = vregs[i]; + + if (_reg.is_const()) + { + continue; + } + + if (_reg == atomic16.ls) + { + atomic16.reg = i; + found = true; + break; + } + } + + if (!found) + { + break_putllc16(3, atomic16.discard()); + break; + } + } + + if (atomic16.ls_access && atomic16.ls_write && !atomic16.lsa.compare_with_mask_indifference(_lsa, 0x3ff80)) + { + // LSA latest value mismatches with the one written with GETLLAR + + if (atomic16.lsa.flag != _lsa.flag) + { + break_putllc16(1, atomic16.discard()); + } + else + { + break_putllc16(2, atomic16.discard()); + } + + break; + } + + if (atomic16.ls_access && atomic16.ls_write) + { + bool ok = false; + + if (atomic16.ls_pc_rel) + { + // + } + else if (atomic16.lsa.is_const()) + { + if (atomic16.ls.is_const()) + { + if (atomic16.ls_offs) + { + // Rebase constant so we can get rid of ls_offs + atomic16.ls.value = spu_ls_target(atomic16.ls_offs + atomic16.ls.value); + atomic16.ls_offs = 0; + } + + if (atomic16.ls.value >= (atomic16.lsa.value & -128) && atomic16.ls.value < utils::align(atomic16.lsa.value + 1, 128)) + { + ok = true; + } + } + else if (atomic16.ls_offs >= (atomic16.lsa.value & -128) && 
atomic16.ls_offs < utils::align(atomic16.lsa.value + 1, 128) && atomic16.ls.is_less_than(128 - (atomic16.lsa.value & 127))) + { + ok = true; + } + } + else if (!atomic16.lsa.is_const() && atomic16.lsa == atomic16.ls && atomic16.ls_offs < 0x80) + { + // Unknown value with known offset of less than 128 bytes + ok = true; + } + + if (!ok) + { + // This is quite common.. let's try to select between putllc16 and putllc0 at runtime! + // break_putllc16(100); + // atomic16.discard(); + // break; + atomic16.select_16_or_0_at_runtime = true; + } + } + + if (!atomic16.get_rdatomic) + { + // MFC_RdAtomicStat must have been read, otherwise GETLLAR may not be executed (according to HW tests) + break_putllc16(21, atomic16.discard()); + } + + atomic16.put_pc = pos; + atomic16.put_active = true; + } + + break; + } + default: + { + break_putllc16(4, atomic16.discard()); + break; + } + } + } + else + { + break_putllc16(5, atomic16.discard()); + } + + m_use_rb[pos / 4] = s_reg_mfc_eal; + break; + } + case MFC_EAH: + case SPU_WrDec: + case SPU_WrSRR0: + case SPU_WrEventAck: + break; + default: + { + break_putllc16(6, atomic16.discard()); + break; + } + } + + break; + } + + case spu_itype::RDCH: + { + bool invalidate = true; + + switch (op.ra) + { + case MFC_RdAtomicStat: + { + if (atomic16.active) + { + if (atomic16.put_active) + { + if (getllar_starts.contains(atomic16.lsa_pc) && getllar_starts[atomic16.lsa_pc]) + { + break_putllc16(24, atomic16.discard()); + break; + } + + const auto it = atomic16_all.find(pos); + + if (it == atomic16_all.end()) + { + atomic16_all.emplace(pos, atomic16); + } + else + { + atomic16_t& existing = it->second; + + if (existing.lsa_pc != atomic16.lsa_pc || existing.put_pc != atomic16.put_pc || existing.lsa != atomic16.lsa) + { + break_putllc16(22, atomic16.discard()); + existing.active = false; + } + } + + atomic16.discard(); + } + else if (!atomic16.get_rdatomic) + { + atomic16.get_rdatomic = true; + + // Go above and beyond and also set the constant for it + set_const_value(op.rt, MFC_GETLLAR_SUCCESS); + invalidate = false; + } + } + + break; + } + default: + { + break; + } + } + + if (invalidate) + { + unconst(op.rt); + } + + break; + } + case spu_itype::STQR: + case spu_itype::LQR: + { + const bool is_store = type == spu_itype::STQR; + + if (atomic16.active) + { + const u32 offs = spu_branch_target(pos, op.i16); + + if (atomic16.ls_invalid && is_store) + { + break_putllc16(20, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && !atomic16.ls_pc_rel) + { + break_putllc16(7, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && offs != atomic16.ls_offs) + { + if ((offs ^ atomic16.ls_offs) & 0x3ff80) + { + atomic16.ls_write |= is_store; + } + else + { + // Sad + break_putllc16(8, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = {}; + atomic16.ls_offs = offs; + atomic16.ls_pc_rel = true; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + } + + if (is_store) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + + case spu_itype::STQX: + case spu_itype::LQX: + { + const bool is_store = type == spu_itype::STQX; + + if (atomic16.active) + { + auto ra = get_reg(op.ra); + ra.value &= 0x3ffff; + auto rb = get_reg(op.rb); + rb.value &= 0x3ffff; + + const u32 offs = ra.is_const() ? ra.value : + rb.is_const() ? rb.value : 0; + + auto add_res = ra; + add_res.value += (rb.is_const() ? rb.value : 0); + add_res.value &= 0x3fff0; + add_res.flag &= rb.flag; + add_res.tag = ra.is_const() ? 
rb.tag : + rb.is_const() ? ra.tag : 0; + + const u32 const_flags = u32{ra.is_const()} + u32{rb.is_const()}; + + switch (const_flags) + { + case 2: + { + if (atomic16.ls_invalid && is_store) + { + break_putllc16(20, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && atomic16.ls_pc_rel) + { + break_putllc16(8, atomic16.set_invalid_ls(is_store)); + } + else if (auto _lsa = atomic16.lsa; _lsa.is_const() && ((add_res.value ^ _lsa.value) & 0x3ff80)) + { + // Unrelated, ignore + } + else if (atomic16.ls_access && add_res != atomic16.ls) + { + if (atomic16.ls.is_const() && ((add_res.value ^ atomic16.ls.value) & 0x3ff80)) + { + // Ok + } + else + { + // Sad + break_putllc16(9, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = reg_state_t::from_value(add_res.value); + atomic16.ls_offs = 0; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + + break; + } + case 1: + { + const auto& state = ra.is_const() ? rb : ra; + + if (atomic16.ls_invalid && is_store) + { + break_putllc16(23, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && atomic16.ls_pc_rel) + { + break_putllc16(20, atomic16.set_invalid_ls(is_store)); + } + else if (auto _lsa = atomic16.lsa; !_lsa.is_const() && _lsa == state && offs >= 0x80) + { + // We already know it's an unrelated load/store + } + else if (atomic16.ls_access && atomic16.ls != state) + { + if (atomic16.ls.is_const() && ((state.value ^ atomic16.ls.value) & 0x3ff80)) + { + // Ok + } + else + { + // Sad + break_putllc16(11, atomic16.set_invalid_ls(is_store)); + } + } + else if (atomic16.ls_access && !atomic16.ls.is_const()) + { + if (offs / 16 == atomic16.ls_offs / 16 && offs % 16 == 0) + { + atomic16.ls_write |= is_store; + } + else + { + break_putllc16(12, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = state; + atomic16.ls_offs = offs; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + + break; + } + case 0: + { + // Unimplemnted + break_putllc16(13, atomic16.set_invalid_ls(is_store)); + break; + } + default: fmt::throw_exception("Unreachable!"); + } + } + + if (is_store) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + case spu_itype::STQA: + case spu_itype::LQA: + { + const bool is_store = type == spu_itype::STQA; + + if (atomic16.active) + { + const reg_state_t ca = reg_state_t::from_value(spu_ls_target(0, op.si16)); + + if (atomic16.ls_invalid && is_store) + { + break_putllc16(20, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && atomic16.ls_pc_rel) + { + break_putllc16(14, atomic16.set_invalid_ls(is_store)); + } + else if (auto _lsa = atomic16.lsa; _lsa.is_const() && ((ca.value ^ _lsa.value) & 0x3ff80)) + { + // Unrelated, ignore + } + else if (atomic16.ls_access && ca != atomic16.ls) + { + if (atomic16.ls.is_const() && ((ca.value ^ atomic16.ls.value) & 0x3ff80)) + { + // Ok + } + else + { + // Sad + break_putllc16(15, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = ca; + atomic16.ls_offs = 0; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + } + + if (is_store) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + + case spu_itype::STQD: + case spu_itype::LQD: + { + const bool is_store = type == spu_itype::STQD; + + if (atomic16.active) + { + auto ra = get_reg(op.ra); + auto _lsa = atomic16.lsa; + + ra.value = spu_ls_target(ra.value, op.si10 * 4); + const u32 offs = ra.is_const() ? 
0 : spu_ls_target(0, op.si10 * 4); + const u32 const_flags = u32{ra.is_const()} + u32{atomic16.ls.is_const()}; + const u32 const_lsa_flags = u32{ra.is_const()} + u32{_lsa.is_const()}; + + if (op.si10) + { + ra.known_zeroes = 0; + ra.known_ones = 0; + } + + if (atomic16.ls_access && atomic16.ls_pc_rel) + { + break_putllc16(16, atomic16.set_invalid_ls(is_store)); + } + else if ((const_lsa_flags == 2 && ((ra.value ^ _lsa.value) & 0x3ff80)) || + (const_lsa_flags == 0 && _lsa == ra && offs >= 0x80)) + { + // We already know it's an unrelated load/store + } + else if (atomic16.ls_access && atomic16.ls != ra) + { + if (const_flags == 2 && ((ra.value ^ atomic16.ls.value) & 0x3ff80)) + { + // Ok + } + else + { + // Sad + break_putllc16(17, atomic16.set_invalid_ls(is_store)); + } + } + else if (atomic16.ls_access && const_flags == 0) + { + if (offs / 16 == atomic16.ls_offs / 16) + { + atomic16.ls_write |= is_store; + } + else + { + break_putllc16(18, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = ra; + atomic16.ls_offs = offs; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + } + + if (type == spu_itype::STQD) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + + case spu_itype::HBR: + { + hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt); + const auto [af, av, at, ao, az] = get_reg(op.ra); + hbr_tg = af & vf::is_const && !op.c ? av & 0x3fffc : -1; + break; + } + + case spu_itype::HBRA: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(0x0, op.i16); + break; + } + + case spu_itype::HBRR: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(pos, op.i16); + break; + } + + case spu_itype::IL: + { + set_const_value(op.rt, op.si16); + break; + } + case spu_itype::ILA: + { + set_const_value(op.rt, op.i18); + break; + } + case spu_itype::ILH: + { + set_const_value(op.rt, op.i16 << 16 | op.i16); + break; + } + case spu_itype::ILHU: + { + set_const_value(op.rt, op.i16 << 16); + break; + } + case spu_itype::IOHL: + { + const auto rt = get_reg(op.rt); + inherit_const_mask_value(op.rt, rt, op.i16, 0); + break; + } + case spu_itype::ORI: + { + if (!op.si10) + { + move_reg(op.rt, op.ra); + break; + } + + const auto ra = get_reg(op.ra); + inherit_const_mask_value(op.rt, ra, op.si10, 0); + break; + } + case spu_itype::OR: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv | av); + break; + } + case spu_itype::XORI: + { + if (!op.si10) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, av ^ op.si10); + break; + } + case spu_itype::XOR: + { + if (op.ra == op.rb) + { + set_const_value(op.rt, 0); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv ^ av); + break; + } + case spu_itype::NOR: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, ~(bv | av)); + break; + } + case spu_itype::ANDI: + { + const auto ra = get_reg(op.ra); + inherit_const_mask_value(op.rt, ra, 0, ~op.si10); + break; + } + case spu_itype::AND: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv & av); + break; + } + case spu_itype::AI: + { + if 
(!op.si10) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, av + op.si10); + break; + } + case spu_itype::A: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv + av); + break; + } + case spu_itype::SFI: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, op.si10 - av); + break; + } + case spu_itype::SF: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv - av); + break; + } + case spu_itype::FSMBI: + { + const u32 mask = (op.i16 >> 12); + + const u32 value = (mask & 1 ? 0xff : 0) | + (mask & 2 ? 0xff00 : 0) | + (mask & 4 ? 0xff0000 : 0) | + (mask & 8 ? 0xff000000u : 0); + + set_const_value(op.rt, value); + break; + } + case spu_itype::ROTMI: + { + m_regmod[pos / 4] = op.rt; + + if ((0 - op.i7) & 0x20) + { + set_const_value(op.rt, 0); + break; + } + + if (!op.i7) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, av >> ((0 - op.i7) & 0x1f)); + break; + } + case spu_itype::SHLI: + { + if (op.i7 & 0x20) + { + set_const_value(op.rt, 0); + break; + } + + if (!op.i7) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, av << (op.i7 & 0x1f)); + break; + } + case spu_itype::SHLQBYI: + { + if (op.i7 & 0x10) + { + set_const_value(op.rt, 0); + break; + } + + if (!op.i7) + { + move_reg(op.rt, op.ra); + break; + } + + [[fallthrough]]; + } + default: + { + // Make unknown value + if (!(type & spu_itype::zregmod)) + { + const u32 op_rt = type & spu_itype::_quadrop ? 
+op.rt4 : +op.rt; + unconst(op_rt); + } + + break; + } + } + + if (m_targets.count(pos)) + { + for (u32 next_target : ::at32(m_targets, pos)) + { + add_block(next_target); + } + + next_block(); + } + } + + // Fill block info + for (auto& pred : m_preds) + { + auto& block = m_bbs[pred.first]; + + // Copy predeccessors (wrong at this point, needs a fixup later) + block.preds = pred.second; + + // Fill register usage info + for (u32 ia = pred.first; ia < limit; ia += 4) + { + block.size++; + + // Decode instruction + const spu_opcode_t op{std::bit_cast>(result.data[(ia - lsa) / 4])}; + + const auto type = g_spu_itype.decode(op.opcode); + + u8 reg_save = 255; + + if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt]) + { + // Register saved onto the stack before use + block.reg_save_dom[op.rt] = true; + + reg_save = op.rt; + } + + for (auto* _use : {&m_use_ra, &m_use_rb, &m_use_rc}) + { + if (u8 reg = (*_use)[ia / 4]; reg < s_reg_max) + { + // Register reg use only if it happens before reg mod + if (!block.reg_mod[reg]) + { + block.reg_use.set(reg); + + if (reg_save != reg && block.reg_save_dom[reg]) + { + // Register is still used after saving; probably not eligible for optimization + block.reg_save_dom[reg] = false; + } + } + } + } + + if (m_use_rb[ia / 4] == s_reg_mfc_eal) + { + // Expand MFC_Cmd reg use + for (u8 reg : {s_reg_mfc_lsa, s_reg_mfc_tag, s_reg_mfc_size}) + { + if (!block.reg_mod[reg]) + block.reg_use.set(reg); + } + } + + // Register reg modification + if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max) + { + block.reg_mod.set(reg); + block.reg_mod_xf.set(reg, type & spu_itype::xfloat); + + if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb])) + block.reg_mod_xf.set(reg); + + // Possible post-dominating register load + if (type == spu_itype::LQD && op.ra == s_reg_sp) + block.reg_load_mod[reg] = ia + 1; + else + block.reg_load_mod[reg] = 0; + } + + // Find targets (also means end of the block) + const auto tfound = m_targets.find(ia); + + if (tfound != m_targets.end()) + { + // Copy targets + block.targets = tfound->second; + + // Assume that the call reads and modifies all volatile registers (TODO) + bool is_call = false; + bool is_tail = false; + switch (type) + { + case spu_itype::BRSL: + is_call = spu_branch_target(ia, op.i16) != ia + 4; + break; + case spu_itype::BRASL: + is_call = spu_branch_target(0, op.i16) != ia + 4; + break; + case spu_itype::BISL: + case spu_itype::BISLED: + is_call = true; + break; + default: + break; + } + + if (is_call) + { + for (u32 i = 0; i < s_reg_max; ++i) + { + if (i == s_reg_lr || (i >= 2 && i < s_reg_80) || i > s_reg_127) + { + if (!block.reg_mod[i]) + block.reg_use.set(i); + + if (!is_tail) { block.reg_mod.set(i); block.reg_mod_xf[i] = false; @@ -3659,13 +5237,6 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } - if (!m_bbs.count(entry_point)) - { - // Invalid code - spu_log.error("[0x%x] Invalid code", entry_point); - return {}; - } - // Fill entry map while (true) { @@ -4410,6 +5981,59 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } + std::string func_hash; + if (!result.data.empty()) + { + sha1_context ctx; + u8 output[20]{}; + + sha1_starts(&ctx); + sha1_update(&ctx, reinterpret_cast(result.data.data()), result.data.size() * 4); + sha1_finish(&ctx, output); + fmt::append(func_hash, "%s", fmt::base57(output)); + } + + for (const auto& [pc_commited, pattern] : atomic16_all) + { + if (!pattern.active) 
+ { + continue; + } + + if (getllar_starts.contains(pattern.lsa_pc) && getllar_starts[pattern.lsa_pc]) + { + continue; + } + + auto& stats = g_fxo->get(); + + if (!pattern.ls_write) + { + spu_log.success("PUTLLC0 Pattern Detected! (put_pc=0x%x, %s) (putllc0=%d, putllc16+0=%d, all=%d)", pattern.put_pc, func_hash, ++stats.nowrite, ++stats.single, +stats.all); + result.add_pattern(true, spu_program::inst_attr::putllc0, pattern.put_pc - lsa); + continue; + } + + union putllc16_info + { + u32 data; + bf_t is_const; + bf_t is_pc_rel; + bf_t runtime16_select; + bf_t reg; + bf_t offs; + } value; + + value.is_const = pattern.ls.is_const(); + value.is_pc_rel = pattern.ls_pc_rel; + value.offs = value.is_const ? pattern.ls.value : pattern.ls_offs; + value.reg = pattern.reg; + value.runtime16_select = pattern.select_16_or_0_at_runtime; + result.add_pattern(false, spu_program::inst_attr::putllc16, pattern.put_pc - result.entry_point, value.data); + + spu_log.success("PUTLLC16 Pattern Detected! (put_pc=0x%x, is_pc_rel=%d, offset=0x%x, is_const=%d, %s) (putllc0=%d, putllc16+0=%d, all=%d)", pattern.put_pc, value.is_pc_rel, value.offs, value.is_const, func_hash, +stats.nowrite, ++stats.single, +stats.all); + } + if (result.data.empty()) { // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback @@ -5202,6 +6826,258 @@ std::unique_ptr spu_recompiler_base::make_fast_llvm_recompi return std::make_unique(); } +std::array& block_reg_info::evaluate_start_state(const std::map>& map) +{ + if (!has_true_state) + { + std::basic_string been_there; + + struct iterator_info + { + u32 block_pc = SPU_LS_SIZE; + + struct state_t + { + u32 block_pc = SPU_LS_SIZE; + std::array reg_state; + bool disconnected = false; + bool state_written = false; + }; + + std::vector state_prev; + usz completed = 0; + usz parent_iterator_index = umax; + }; + + std::vector info_queue; + + iterator_info first_entry{pc, {}, 0, umax}; + info_queue.emplace_back(std::move(first_entry)); + + // info_queue may grow + for (usz qi = 0; qi < info_queue.size();) + { + auto it = &info_queue[qi]; + + if (it->state_prev.empty()) + { + // Build the list here to avoid code duplication + + const auto& node = ::at32(map, it->block_pc); + const usz real_size = node->prev_nodes.size(); + + if (real_size) + { + it->state_prev.resize(real_size); + + for (usz i = 0; i < real_size; i++) + { + it->state_prev[i].block_pc = node->prev_nodes[i].prev_pc; + } + } + } + + const usz next_entry_idx = it->completed; + + if (next_entry_idx == it->state_prev.size()) + { + // Result merge from all predecessors + auto& cur_node = ::at32(map, it->block_pc); + + // Flag to mark the state as resolved + bool is_all_resolved = true; + bool has_past_state = false; + + for (usz bi = 0; bi < it->state_prev.size(); bi++) + { + if (it->state_prev[bi].disconnected) + { + is_all_resolved = false; + continue; + } + + has_past_state = true; + + const u32 node_pc = it->state_prev[bi].block_pc; + const auto& node = ::at32(map, node_pc); + + // Check if the node is resolved + if (!node->has_true_state) + { + // Assume this block cannot be resolved at the moment + is_all_resolved = false; + break; + } + } + + std::array temp; + + if (qi == 0) + { + // TODO: First block is always resolved here, but this logic can be improved to detect more cases of opportunistic resolving + is_all_resolved = true; + } + + auto& res_state = is_all_resolved ? 
cur_node->true_start_state : temp; + + for (usz bi = 0; bi < it->state_prev.size(); bi++) + { + if (it->state_prev[bi].disconnected) + { + // Loop state, even if not ignored for a million times the result would still be the same + // So ignore it + continue; + } + + std::array* arg_state{}; + const auto& node = ::at32(map, it->state_prev[bi].block_pc); + + if (node->has_true_state) + { + // State is resolved, use the entry's state + arg_state = std::addressof(node->end_state); + } + else + { + // Use accumulated state from one path of code history + arg_state = std::addressof(it->state_prev[bi].reg_state); + ensure(it->state_prev[bi].state_written); + } + + if (bi == 0) + { + res_state = *arg_state; + } + else + { + res_state = merge(res_state, *arg_state); + } + } + + ensure(it->parent_iterator_index == qi - 1); + + std::array* result_storage{}; + + if (is_all_resolved) + { + // Complete state of this block + result_storage = std::addressof(cur_node->end_state); + cur_node->has_true_state = true; + } + else + { + // Patch incomplete state into saved state entry of parent block + ensure(it->parent_iterator_index != qi); + ensure(it->parent_iterator_index != umax); + + auto& state = ::at32(info_queue, it->parent_iterator_index).state_prev; + + for (usz i = 0;; i++) + { + ensure(i < state.size()); + + if (state[i].block_pc == it->block_pc) + { + result_storage = std::addressof(state[i].reg_state); + state[i].state_written = true; + break; + } + } + } + + // Stack the newer state on top of the old (if exists) + if (has_past_state) + { + *result_storage = build_on_top_of(cur_node->addend_state, res_state); + } + else + { + *result_storage = cur_node->addend_state; + } + + if (qi != 0) + { + ::at32(been_there, been_there.size() - 1); + been_there.pop_back(); + info_queue.pop_back(); + qi--; + } + else + { + ensure(cur_node->has_true_state); + break; + } + } + else + { + const u32 prev_pc = ::at32(map, it->block_pc)->prev_nodes[it->completed++].prev_pc; + const auto& prev_node = ::at32(map, prev_pc); + + // Queue for resolving if needed + if (!prev_node->has_true_state) + { + const bool loop_detected = been_there.find_first_of(prev_pc) != umax; + const bool avoid_extensive_analysis = qi >= 10'000; + + if (!loop_detected && !avoid_extensive_analysis) + { + info_queue.emplace_back(iterator_info{prev_pc, {}, 0, qi}); + been_there.push_back(prev_pc); + qi++; + } + else + { + auto& state = it->state_prev; + + for (usz i = 0;; i++) + { + ensure(i < state.size()); + + if (state[i].block_pc == prev_pc) + { + // Loop state, even if not ignored for a million times the result would be the same + // This is similar to multiplying zero a million times + // This is true at least for now, that any register difference is considered an unknown state change + // So ignore it + state[i].disconnected = true; + break; + } + } + + // Repeat + // qi += 0; + } + } + else + { + // Repeat + // qi += 0; + } + } + } + + ensure(has_true_state); + } + + walkby_state = true_start_state; + return walkby_state; +} + +void spu_program::add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end) +{ + if (end == umax) + { + end = start; + } + + patterns.try_emplace(start, pattern_info{utils::address_range::start_end(start, end)}); + + for (u32 i = start; i <= (fill_all ? 
end : start); i += 4) + { + inst_attrs[i / 4] = attr; + } +} + extern std::string format_spu_func_info(u32 addr, cpu_thread* spu) { spu_thread* _spu = static_cast(spu); diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 50afa7a568f6..ef532fcdc831 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -5,6 +5,7 @@ #include "Emu/system_config.h" #include "Emu/IdManager.h" #include "Emu/Cell/timers.hpp" +#include "Emu/Memory/vm_reservation.h" #include "Crypto/sha1.h" #include "Utilities/JIT.h" @@ -534,6 +535,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return m_ir->CreateGEP(get_type(), base, m_ir->getInt64(offset)); } + template + llvm::Value* _ptr(llvm::Value* base, llvm::Value* offset) + { + const auto off = m_ir->CreateGEP(get_type(), base, offset); + const auto ptr = m_ir->CreateBitCast(off, get_type()); + return ptr; + } + template llvm::Value* spu_ptr(Args... offset_args) { @@ -1078,6 +1087,159 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_body); } + void putllc16_pattern(const spu_program& prog, utils::address_range range) + { + // Prevent store elimination + m_block->store_context_ctr[s_reg_mfc_eal]++; + m_block->store_context_ctr[s_reg_mfc_lsa]++; + m_block->store_context_ctr[s_reg_mfc_tag]++; + m_block->store_context_ctr[s_reg_mfc_size]++; + + static const auto on_fail = [](spu_thread* _spu, u32 addr) + { + if (const u32 raddr = _spu->raddr) + { + // Last check for event before we clear the reservation + if (raddr == addr || _spu->rtime != (vm::reservation_acquire(raddr) & -128) || std::memcmp(&_spu->rdata, vm::_ptrrdata)>(raddr), 128)) + { + _spu->set_events(SPU_EVENT_LR); + } + } + }; + + union putllc16_info + { + u32 data; + bf_t is_const; + bf_t is_pc_rel; + bf_t runtime16_select; + bf_t reg; + bf_t offs; + } info; + + const auto _next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _next0 = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _next1 = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _next2 = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _fail = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _fail0 = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _final = llvm::BasicBlock::Create(m_context, "", m_function); + info.data = range.end; + + const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir); + + m_ir->CreateCondBr(m_ir->CreateICmpEQ(_eal, m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::raddr))), _next, _fail, m_md_likely); + m_ir->SetInsertPoint(_next); + + value_t eal_val; + eal_val.value = _eal; + + const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rval = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + m_ir->CreateCondBr( + m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateOr(rval, 0x7f), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1) + , _next0 + , _fail); + + m_ir->SetInsertPoint(_next0); + + const auto _lsa = (get_reg_fixed(s_reg_mfc_lsa) & 0x3ff80).eval(m_ir); + const auto _dest = info.is_pc_rel ? get_pc(info.offs) : + !info.is_const ? 
(extract(get_reg_fixed(info.reg), 3) + splat(info.offs)).eval(m_ir) : splat(info.offs & 0x3fff0).eval(m_ir); + const auto dest = info.is_const ? _dest : m_ir->CreateAnd(_dest, m_ir->getInt32(0x3fff0)); + + const auto diff = m_ir->CreateZExt(m_ir->CreateSub(dest, _lsa), get_type()); + + if (info.runtime16_select) + { + m_ir->CreateCondBr(m_ir->CreateICmpULT(diff, m_ir->getInt64(128)), _next1, _next2); + } + else + { + m_ir->CreateBr(_next1); + } + + m_ir->SetInsertPoint(_next1); + const auto _new = m_ir->CreateAlignedLoad(get_type(), _ptr(m_lsptr, dest), llvm::MaybeAlign{16}); + const auto _rdata = m_ir->CreateAlignedLoad(get_type(), _ptr(spu_ptr(&spu_thread::rdata), diff), llvm::MaybeAlign{16}); + + m_ir->CreateCondBr( + m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(_ptr(_ptr(m_memptr, _eal), diff), _rdata, _new, llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1) + , _next2 + , _fail0); + + m_ir->SetInsertPoint(_next2); + m_ir->CreateAlignedStore(m_ir->CreateAdd(rval, m_ir->getInt64(128)), rptr, llvm::MaybeAlign{8}); + call("atomic_wait_engine::notify_all", static_cast(atomic_wait_engine::notify_all), rptr); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateBr(_final); + + m_ir->SetInsertPoint(_fail0); + m_ir->CreateStore(rval, rptr); + m_ir->CreateBr(_fail); + m_ir->SetInsertPoint(_fail); + call("PUTLLC16_fail", +on_fail, m_thread, _eal); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateBr(_final); + + m_ir->SetInsertPoint(_final); + m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr)); + } + + void putllc0_pattern(const spu_program& prog, utils::address_range range) + { + // Prevent store elimination + m_block->store_context_ctr[s_reg_mfc_eal]++; + m_block->store_context_ctr[s_reg_mfc_lsa]++; + m_block->store_context_ctr[s_reg_mfc_tag]++; + m_block->store_context_ctr[s_reg_mfc_size]++; + + static const auto on_fail = [](spu_thread* _spu, u32 addr) + { + if (const u32 raddr = _spu->raddr) + { + // Last check for event before we clear the reservation + if (raddr == addr || _spu->rtime != (vm::reservation_acquire(raddr) & -128) || std::memcmp(&_spu->rdata, vm::_ptrrdata)>(raddr), 128)) + { + _spu->set_events(SPU_EVENT_LR); + } + } + }; + + const auto _next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _next0 = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _fail = llvm::BasicBlock::Create(m_context, "", m_function); + const auto _final = llvm::BasicBlock::Create(m_context, "", m_function); + + const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir); + + m_ir->CreateCondBr(m_ir->CreateICmpEQ(_eal, m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::raddr))), _next, _fail, m_md_likely); + m_ir->SetInsertPoint(_next); + + value_t eal_val; + eal_val.value = _eal; + + const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rval = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + m_ir->CreateCondBr( + m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateAdd(rval, m_ir->getInt64(128)), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1) + , _next0 + , g_cfg.core.spu_accurate_reservations ? 
_fail : _next0); // Succeed unconditionally
+
+		m_ir->SetInsertPoint(_next0);
+		call("atomic_wait_engine::notify_all", static_cast(atomic_wait_engine::notify_all), rptr);
+		m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat));
+		m_ir->CreateBr(_final);
+
+		m_ir->SetInsertPoint(_fail);
+		call("PUTLLC0_fail", +on_fail, m_thread, _eal);
+		m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat));
+		m_ir->CreateBr(_final);
+
+		m_ir->SetInsertPoint(_final);
+		m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr));
+	}
+
 public:
 	spu_llvm_recompiler(u8 interp_magn = 0)
 		: spu_recompiler_base()
@@ -1621,6 +1783,26 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 				else
 					m_next_op = func.data[(m_pos - start) / 4 + 1];
 
+				switch (func.inst_attrs[(m_pos - start) / 4])
+				{
+				case spu_program::inst_attr::putllc0:
+				{
+					putllc0_pattern(func, func.patterns.at(m_pos - start).range);
+					continue;
+				}
+				case spu_program::inst_attr::putllc16:
+				{
+					putllc16_pattern(func, func.patterns.at(m_pos - start).range);
+					continue;
+				}
+				case spu_program::inst_attr::omit:
+				{
+					// TODO
+					continue;
+				}
+				default: break;
+				}
+
 				// Execute recompiler function (TODO)
 				(this->*decode(op))({op});
 			}
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index 26f59cffb046..d4a441833777 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -2,6 +2,7 @@
 
 #include "Utilities/File.h"
 #include "Utilities/lockless.h"
+#include "Utilities/address_range.h"
 #include "SPUThread.h"
 #include 
 #include 
@@ -59,6 +60,27 @@ struct spu_program
 	// Program data with intentionally wrong endianness (on LE platform opcode values are swapped)
 	std::vector<u32> data;
 
+	// TODO: Add patterns
+	// Not a bitset to allow more possibilities
+	enum class inst_attr : u8
+	{
+		none,
+		omit,
+		putllc16,
+		putllc0,
+	};
+
+	std::vector<inst_attr> inst_attrs;
+
+	struct pattern_info
+	{
+		utils::address_range range;
+	};
+
+	std::unordered_map<u32, pattern_info> patterns;
+
+	void add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end = -1);
+
 	bool operator==(const spu_program& rhs) const noexcept;
 
 	bool operator<(const spu_program& rhs) const noexcept;
@@ -212,6 +234,9 @@ class spu_recompiler_base
 	// List of block predecessors
 	std::unordered_map, value_hash> m_preds;
 
+	// List of loop connectors (destination -> branches)
+	std::unordered_map, value_hash> m_loops;
+
 	// List of function entry points and return points (set after BRSL, BRASL, BISL, BISLED)
 	std::bitset<0x10000> m_entry_info;
 
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 24914f70758b..4736f5764b09 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -668,6 +668,7 @@ class spu_thread : public cpu_thread
 
 	// May be used by recompilers.
u8* memory_base_addr = vm::g_base_addr; + u8* reserv_base_addr = vm::g_reservations; // General-Purpose Registers std::array gpr; diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 7447f3317088..3261a3044ffd 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -3022,11 +3022,16 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s *join_thread = make_ptr(new named_thread("Emulation Join Thread"sv, [join_thread, savestate, allow_autoexit, this]() mutable { fs::pending_file file; - std::shared_ptr init_mtx = std::make_shared(); - std::shared_ptr join_ended = std::make_shared(false); - atomic_ptr to_ar; - named_thread stop_watchdog("Stop Watchdog"sv, [&to_ar, init_mtx, join_ended, this]() + using std::make_shared; + + auto verbose_message = make_shared>(); + auto init_mtx = make_shared(); + auto join_ended = make_shared(false); + auto to_ar = make_shared>(); + + auto stop_watchdog = make_ptr(new named_thread("Stop Watchdog"sv, + [to_ar, init_mtx, join_ended, verbose_message, this]() { const auto closed_sucessfully = std::make_shared>(false); @@ -3058,10 +3063,10 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s while (thread_ctrl::state() != thread_state::aborting) { - if (auto ar_ptr = to_ar.load()) + if (auto ar_ptr = to_ar->load()) { // Total amount of waiting: about 10s - GetCallbacks().on_save_state_progress(closed_sucessfully, ar_ptr, init_mtx); + GetCallbacks().on_save_state_progress(closed_sucessfully, ar_ptr, verbose_message.get(), init_mtx); while (thread_ctrl::state() != thread_state::aborting) { @@ -3076,7 +3081,7 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s *closed_sucessfully = true; - }); + })); // Join threads for (const auto& [type, data] : *g_fxo) @@ -3097,8 +3102,15 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s static_cast(init_mtx->init()); + auto set_progress_message = [&](std::string_view text) + { + *verbose_message = stx::make_single(text); + }; + while (savestate) { + set_progress_message("Creating File"); + path = get_savestate_file(m_title_id, m_path, 0, 0); // The function is meant for reading files, so if there is no GZ file it would not return compressed file path @@ -3124,7 +3136,7 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s auto serial_ptr = stx::make_single(); serial_ptr->m_file_handler = make_compressed_serialization_file_handler(file.file); - to_ar = std::move(serial_ptr); + *to_ar = std::move(serial_ptr); signal_system_cache_can_stay(); break; @@ -3142,7 +3154,7 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s return fmt::format("Emu State Capture Thread: '%s'", g_tls_serialize_name); }; - auto& ar = *to_ar.load(); + auto& ar = *to_ar->load(); read_used_savestate_versions(); // Reset version data USING_SERIALIZATION_VERSION(global_version); @@ -3217,6 +3229,8 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s ar(std::string{}); }; + set_progress_message("Creating Header"); + ar("RPCS3SAV"_u64); ar(std::endian::native == std::endian::little); ar(g_cfg.savestate.state_inspection_mode.get()); @@ -3256,12 +3270,22 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s ar(klic.empty() ? 
@@ -3256,12 +3270,22 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s
 		ar(klic.empty() ? std::array<u8, 16>{} : std::bit_cast<std::array<u8, 16>>(klic[0]));
 		ar(m_game_dir);
+
+		set_progress_message("Saving HDD1");
 		save_hdd1();
+		set_progress_message("Saving HDD0");
 		save_hdd0();
+		ar(std::array{}); // Reserved for future use
+
+		set_progress_message("Saving VMemory");
 		vm::save(ar);
+
+		set_progress_message("Saving FXO");
 		g_fxo->save(ar);
 
+		set_progress_message("Finalizing File");
+
 		bs_t<SaveStateExtentionFlags1> extension_flags{SaveStateExtentionFlags1::SupportsMenuOpenResume};
 
 		if (g_fxo->get().active)
@@ -3285,18 +3309,17 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s
 			if (emu_state_cap_thread == thread_state::errored)
 			{
 				sys_log.error("Saving savestate failed due to fatal error!");
-				to_ar.reset();
+				to_ar->reset();
 				savestate = false;
 			}
 		}
 
-		stop_watchdog = thread_state::finished;
-		static_cast<void>(init_mtx->reset());
-
 		if (savestate)
 		{
 			fs::stat_t file_stat{};
 
+			set_progress_message("Committing File");
+
 			if (!file.commit() || !fs::get_stat(path, file_stat))
 			{
 				sys_log.error("Failed to write savestate to file! (path='%s', %s)", path, fs::g_tls_error);
@@ -3394,19 +3417,25 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s
 			}
 		}
 
+		set_progress_message("Resetting Objects");
+
 		// Final termination from main thread (move the last ownership of join thread in order to destroy it)
-		CallFromMainThread([join_thread = std::move(join_thread), allow_autoexit, this]() mutable
+		CallFromMainThread([join_thread = std::move(join_thread), verbose_message, stop_watchdog, init_mtx, allow_autoexit, this]()
 		{
 			cpu_thread::cleanup();
 
 			lv2_obj::cleanup();
+			g_fxo->reset();
 
 			sys_log.notice("Objects cleared...");
 
 			vm::close();
 
+			*stop_watchdog = thread_state::finished;
+			static_cast<void>(init_mtx->reset());
+
 			jit_runtime::finalize();
 
 			perf_stat_base::report();
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index f4b86cf01af5..adbc97e35841 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -64,7 +64,7 @@ struct EmuCallbacks
 	std::function<void()> on_ready;
 	std::function<bool()> on_missing_fw;
 	std::function<void(std::shared_ptr<atomic_t<bool>>, int)> on_emulation_stop_no_response;
-	std::function<void(std::shared_ptr<atomic_t<bool>>, stx::shared_ptr<utils::serial>, std::shared_ptr<void>)> on_save_state_progress;
+	std::function<void(std::shared_ptr<atomic_t<bool>>, stx::shared_ptr<utils::serial>, stx::atomic_ptr<std::string>*, std::shared_ptr<void>)> on_save_state_progress;
 	std::function<void(bool)> enable_disc_eject;
 	std::function<void(bool)> enable_disc_insert;
 	std::function<bool(bool, std::function<void()>)> try_to_quit; // (force_quit, on_exit) Try to close RPCS3
diff --git a/rpcs3/headless_application.cpp b/rpcs3/headless_application.cpp
index eb57704d09f8..1204ba85efe1 100644
--- a/rpcs3/headless_application.cpp
+++ b/rpcs3/headless_application.cpp
@@ -148,7 +148,7 @@ void headless_application::InitializeCallbacks()
 		}
 	};
 
-	callbacks.on_save_state_progress = [](std::shared_ptr<atomic_t<bool>>, stx::shared_ptr<utils::serial>, std::shared_ptr<void>)
+	callbacks.on_save_state_progress = [](std::shared_ptr<atomic_t<bool>>, stx::shared_ptr<utils::serial>, stx::atomic_ptr<std::string>*, std::shared_ptr<void>)
 	{
 	};
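Note that the widened on_save_state_progress signature hands frontends a raw stx::atomic_ptr<std::string>* rather than ownership; the pointer stays valid because verbose_message is kept alive by the watchdog thread and by the final CallFromMainThread continuation, which outlive every invocation of the callback. A hypothetical frontend that only logs progress could therefore look like the sketch below (illustrative only; it assumes a sys_log-style logger is in scope and is not part of this patch):

// Hypothetical frontend registration, mirroring the headless stub above
callbacks.on_save_state_progress = [](std::shared_ptr<atomic_t<bool>>, stx::shared_ptr<utils::serial> ar_ptr,
	stx::atomic_ptr<std::string>* code_location, std::shared_ptr<void>)
{
	// Snapshot the byte count and the current stage label; never store the raw pointer
	const usz bytes_written = ar_ptr ? ar_ptr->get_size() : 0;

	if (auto str_ptr = code_location->load())
	{
		sys_log.notice("Savestate progress: %s (%s bytes)", *str_ptr, bytes_written);
	}
};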
diff --git a/rpcs3/rpcs3qt/gui_application.cpp b/rpcs3/rpcs3qt/gui_application.cpp
index a334e929bfef..f45e00e4210e 100644
--- a/rpcs3/rpcs3qt/gui_application.cpp
+++ b/rpcs3/rpcs3qt/gui_application.cpp
@@ -685,9 +685,9 @@ void gui_application::InitializeCallbacks()
 		});
 	};
 
-	callbacks.on_save_state_progress = [this](std::shared_ptr<atomic_t<bool>> closed_successfully, stx::shared_ptr<utils::serial> ar_ptr, std::shared_ptr<void> init_mtx)
+	callbacks.on_save_state_progress = [this](std::shared_ptr<atomic_t<bool>> closed_successfully, stx::shared_ptr<utils::serial> ar_ptr, stx::atomic_ptr<std::string>* code_location, std::shared_ptr<void> init_mtx)
 	{
-		Emu.CallFromMainThread([this, closed_successfully, ar_ptr, init_mtx]
+		Emu.CallFromMainThread([this, closed_successfully, ar_ptr, code_location, init_mtx]
 		{
 			const auto half_seconds = std::make_shared<int>(1);
@@ -696,30 +696,44 @@ void gui_application::InitializeCallbacks()
 			pdlg->setAutoClose(true);
 			pdlg->show();
 
-			QString text_base = tr("Waiting for %0 second(s), %1 written");
+			QString text_base = tr("%0 written, %1 second(s) passed%2");
 
-			pdlg->setLabelText(text_base.arg(0).arg("0B"));
+			pdlg->setLabelText(text_base.arg("0B").arg(1).arg(""));
 			pdlg->setAttribute(Qt::WA_DeleteOnClose);
 
 			QTimer* update_timer = new QTimer(pdlg);
 
-			connect(update_timer, &QTimer::timeout, [pdlg, ar_ptr, half_seconds, text_base, closed_successfully, init_mtx]()
+			connect(update_timer, &QTimer::timeout, [pdlg, ar_ptr, half_seconds, text_base, closed_successfully, code_location, init_mtx]()
 			{
-				auto init = static_cast<stx::init_mutex*>(init_mtx.get())->access();
+				std::string verbose_message;
+				usz bytes_written = 0;
 
-				if (!init)
 				{
-					pdlg->reject();
-					return;
-				}
+					auto init = static_cast<stx::init_mutex*>(init_mtx.get())->access();
 
-				*half_seconds += 1;
+					if (!init)
+					{
+						pdlg->reject();
+						return;
+					}
 
-				const usz bytes_written = ar_ptr->get_size();
-				pdlg->setLabelText(text_base.arg(*half_seconds / 2).arg(gui::utils::format_byte_size(bytes_written)));
+					if (auto str_ptr = code_location->load())
+					{
+						verbose_message = "\n" + *str_ptr;
+					}
+
+					*half_seconds += 1;
+
+					bytes_written = ar_ptr->get_size();
+				}
+
+				pdlg->setLabelText(text_base.arg(gui::utils::format_byte_size(bytes_written)).arg(*half_seconds / 2).arg(qstr(verbose_message)));
 
 				// 300MB -> 50%, 600MB -> 75%, 1200MB -> 87.5% etc
-				pdlg->setValue(std::clamp(static_cast<int>(100. - 100. / std::pow(2., std::fmax(0.01, bytes_written * 1. / (300 * 1024 * 1024)))), 2, 100));
+				const int percent = std::clamp(static_cast<int>(100. - 100. / std::pow(2., std::fmax(0.01, bytes_written * 1. / (300 * 1024 * 1024)))), 2, 100);
+
+				// Add a third of the remaining progress when the keyword is found
+				pdlg->setValue(verbose_message.find("Finalizing") != umax ? 100 - ((100 - percent) * 2 / 3) : percent);
 
 				if (*closed_successfully)
 				{