diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 9a61cf769f4d..f1488e11320b 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -923,6 +923,12 @@ class cpu_translator return llvm_value_t::get_type(m_context); } + template + llvm::FunctionType* get_ftype() + { + return llvm::FunctionType::get(get_type(), {get_type()...}, false); + } + template using value_t = llvm_value_t; @@ -1083,6 +1089,15 @@ class cpu_translator return result; } + template + auto vsplat(V v) + { + value_t result; + static_assert(result.is_vector); + result.value = m_ir->CreateVectorSplat(result.is_vector, v.eval(m_ir)); + return result; + } + // Min template auto min(T a, T b) @@ -1257,6 +1272,19 @@ class cpu_translator return result; } + llvm::Value* load_const(llvm::GlobalVariable* g, llvm::Value* i) + { + return m_ir->CreateLoad(m_ir->CreateGEP(g, {m_ir->getInt64(0), m_ir->CreateZExtOrTrunc(i, get_type())})); + } + + template + value_t load_const(llvm::GlobalVariable* g, I i) + { + value_t result; + result.value = load_const(g, i.eval(m_ir)); + return result; + } + template R get_const_vector(llvm::Constant*, u32 a, u32 b); diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index b9c800bfd825..788f26660460 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -66,6 +66,8 @@ DECLARE(spu_runtime::g_dispatcher) = [] return ptr; }(); +DECLARE(spu_runtime::g_interpreter) = nullptr; + spu_cache::spu_cache(const std::string& loc) : m_file(loc, fs::read + fs::write + fs::create + fs::append) { @@ -132,6 +134,8 @@ void spu_cache::add(const std::vector& func) void spu_cache::initialize() { + spu_runtime::g_interpreter = nullptr; + const std::string ppu_cache = Emu.PPUCache(); if (ppu_cache.empty()) @@ -160,6 +164,20 @@ void spu_cache::initialize() u32 thread_count = max_threads > 0 ? std::min(max_threads, std::thread::hardware_concurrency()) : std::thread::hardware_concurrency(); std::vector> compilers{thread_count}; + if (g_cfg.core.spu_decoder == spu_decoder_type::fast) + { + if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11)) + { + compiler->init(); + + if (compiler->compile(0, {}) && spu_runtime::g_interpreter) + { + LOG_SUCCESS(SPU, "SPU Runtime: built interpreter."); + return; + } + } + } + for (auto& compiler : compilers) { if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) @@ -2069,14 +2087,29 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // JIT Instance jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)}; - // Current function (chunk) - llvm::Function* m_function; + // Interpreter table size power + const u8 m_interp_magn; + + // Constant opcode bits + u32 m_op_const_mask = -1; // Current function chunk entry point u32 m_entry; + // Current function (chunk) + llvm::Function* m_function; + llvm::Value* m_thread; llvm::Value* m_lsptr; + llvm::Value* m_interp_op; + llvm::Value* m_interp_pc; + llvm::Value* m_interp_table; + llvm::Value* m_interp_7f0; + llvm::Value* m_interp_regs; + + // Helpers + llvm::Value* m_interp_pc_next; + llvm::BasicBlock* m_interp_bblock; // i8*, contains constant vm::g_base_addr value llvm::Value* m_memptr; @@ -2087,6 +2120,10 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Global variable (function table) llvm::GlobalVariable* m_function_table{}; + // Helpers (interpreter) + llvm::GlobalVariable* m_scale_float_to{}; + llvm::GlobalVariable* m_scale_to_float{}; + llvm::MDNode* m_md_unlikely; llvm::MDNode* m_md_likely; @@ -2263,7 +2300,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (const auto phi = m_blocks[target].phi[i]) { const auto typ = phi->getType() == get_type() ? get_type() : get_reg_type(i); - phi->addIncoming(get_vr(i, typ), m_block->block_end); + phi->addIncoming(get_reg_fixed(i, typ), m_block->block_end); } } } @@ -2333,8 +2370,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - llvm::Value* init_vr(u32 index) + llvm::Value* init_reg_fixed(u32 index) { + if (!m_block) + { + return m_ir->CreateBitCast(_ptr(m_thread, get_reg_offset(index)), get_reg_type(index)->getPointerTo()); + } + auto& ptr = m_reg_addr.at(index); if (!ptr) @@ -2352,6 +2394,25 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return ptr; } + // Get pointer to the vector register (interpreter only) + template + llvm::Value* init_vr(const bf_t& index) + { + if (!m_interp_magn) + { + m_interp_7f0 = m_ir->getInt32(0x7f0); + m_interp_regs = _ptr(m_thread, get_reg_offset(0)); + } + + // Extract reg index + const auto isl = I >= 4 ? m_interp_op : m_ir->CreateShl(m_interp_op, u64{4 - I}); + const auto isr = I <= 4 ? m_interp_op : m_ir->CreateLShr(m_interp_op, u64{I - 4}); + const auto idx = m_ir->CreateAnd(I > 4 ? isr : isl, m_interp_7f0); + + // Pointer to the register + return m_ir->CreateBitCast(m_ir->CreateGEP(m_interp_regs, m_ir->CreateZExt(idx, get_type())), get_type()); + } + llvm::Value* double_as_uint64(llvm::Value* val) { if (llvm::isa(val)) @@ -2479,14 +2540,26 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return m_ir->CreateOr(s, e); } - llvm::Value* get_vr(u32 index, llvm::Type* type) + llvm::Value* get_reg_raw(u32 index) + { + if (!m_block || index >= m_block->reg.size()) + { + return nullptr; + } + + return m_block->reg[index]; + } + + llvm::Value* get_reg_fixed(u32 index, llvm::Type* type) { - auto& reg = m_block->reg.at(index); + llvm::Value* dummy{}; + + auto& reg = *(m_block ? &m_block->reg.at(index) : &dummy); if (!reg) { // Load register value if necessary - reg = m_ir->CreateLoad(init_vr(index)); + reg = m_ir->CreateLoad(init_reg_fixed(index)); } if (reg->getType() == get_type()) @@ -2564,17 +2637,50 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } template - value_t get_vr(u32 index) + value_t get_reg_fixed(u32 index) { value_t r; - r.value = get_vr(index, get_type()); + r.value = get_reg_fixed(index, get_type()); + return r; + } + + template + value_t get_vr(const bf_t& index) + { + value_t r; + + if ((m_op_const_mask & index.data_mask()) != index.data_mask()) + { + // Update const mask if necessary + if (I >= (32 - m_interp_magn)) + { + m_op_const_mask |= index.data_mask(); + } + + // Load reg + if (get_type() == get_type()) + { + r.value = xfloat_to_double(m_ir->CreateLoad(init_vr(index))); + } + else + { + r.value = m_ir->CreateLoad(init_vr(index)); + } + } + else + { + r.value = get_reg_fixed(index, get_type()); + } + return r; } - void set_vr(u32 index, llvm::Value* value, bool fixup = true) + void set_reg_fixed(u32 index, llvm::Value* value, bool fixup = true) { + llvm::StoreInst* dummy{}; + // Check - verify(HERE), m_regmod[m_pos / 4] == index; + verify(HERE), !m_block || m_regmod[m_pos / 4] == index; // Test for special case const bool is_xfloat = value->getType() == get_type(); @@ -2583,26 +2689,120 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto saved_value = is_xfloat && fixup ? xfloat_in_double(value) : value; // Set register value - m_block->reg.at(index) = saved_value; + if (m_block) + { + m_block->reg.at(index) = saved_value; + } // Get register location - const auto addr = init_vr(index); + const auto addr = init_reg_fixed(index); + + auto& _store = *(m_block ? &m_block->store[index] : &dummy); // Erase previous dead store instruction if necessary - if (m_block->store[index]) + if (_store) { // TODO: better cross-block dead store elimination - m_block->store[index]->eraseFromParent(); + _store->eraseFromParent(); } // Write register to the context - m_block->store[index] = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, addr->getType()->getPointerElementType()), addr); + _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, addr->getType()->getPointerElementType()), addr); } - template - void set_vr(u32 index, T expr, bool fixup = true) + template + void set_vr(const bf_t& index, T expr, bool fixup = true) + { + // Process expression + const auto value = expr.eval(m_ir); + + // Test for special case + const bool is_xfloat = value->getType() == get_type(); + + if ((m_op_const_mask & index.data_mask()) != index.data_mask()) + { + // Update const mask if necessary + if (I >= (32 - m_interp_magn)) + { + m_op_const_mask |= index.data_mask(); + } + + // Clamp value if necessary + const auto saved_value = is_xfloat && fixup ? xfloat_in_double(value) : value; + + // Store value + m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, get_type()), init_vr(index)); + return; + } + + set_reg_fixed(index, value, fixup); + } + + template + value_t get_imm(const bf_t& imm, bool mask = true) { - set_vr(index, expr.eval(m_ir), fixup); + if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) + { + // Update const mask if necessary + if (I >= (32 - m_interp_magn)) + { + m_op_const_mask |= imm.data_mask(); + } + + // Extract unsigned immediate (skip AND if mask == false or truncated anyway) + value_t r; + r.value = m_interp_op; + r.value = I == 0 ? r.value : m_ir->CreateLShr(r.value, u64{I}); + r.value = !mask || N >= r.esize ? r.value : m_ir->CreateAnd(r.value, imm.data_mask() >> I); + + if (r.esize != 32) + { + r.value = m_ir->CreateZExtOrTrunc(r.value, get_type()->getScalarType()); + } + + if (r.is_vector) + { + r.value = m_ir->CreateVectorSplat(r.is_vector, r.value); + } + + return r; + } + + return splat(imm); + } + + template + value_t get_imm(const bf_t& imm) + { + if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) + { + // Update const mask if necessary + if (I >= (32 - m_interp_magn)) + { + m_op_const_mask |= imm.data_mask(); + } + + // Extract signed immediate (skip sign ext if truncated anyway) + value_t r; + r.value = m_interp_op; + r.value = I + N == 32 || N >= r.esize ? r.value : m_ir->CreateShl(r.value, u64{32 - I - N}); + r.value = N == 32 || N >= r.esize ? r.value : m_ir->CreateAShr(r.value, u64{32 - N}); + r.value = I == 0 || N < r.esize ? r.value : m_ir->CreateLShr(r.value, u64{I}); + + if (r.esize != 32) + { + r.value = m_ir->CreateSExtOrTrunc(r.value, get_type()->getScalarType()); + } + + if (r.is_vector) + { + r.value = m_ir->CreateVectorSplat(r.is_vector, r.value); + } + + return r; + } + + return splat(imm); } // Return either basic block addr with single dominating value, or negative number of PHI entries @@ -2766,9 +2966,10 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } public: - spu_llvm_recompiler() + spu_llvm_recompiler(u8 interp_magn = 0) : spu_recompiler_base() , cpu_translator(nullptr, false) + , m_interp_magn(interp_magn) { } @@ -2794,6 +2995,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator virtual bool compile(u64 last_reset_count, const std::vector& func) override { + if (func.empty() && last_reset_count == 0 && m_interp_magn) + { + return compile_interpreter(); + } + const auto fn_location = m_spurt->find(last_reset_count, func); if (fn_location == spu_runtime::g_dispatcher) @@ -3070,7 +3276,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!value || value->getType() != _phi->getType()) { - const auto regptr = init_vr(i); + const auto regptr = init_reg_fixed(i); const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(bfound->second.block_end->getTerminator()); @@ -3107,7 +3313,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (baddr == m_entry) { // Load value at the function chunk's entry block if necessary - const auto regptr = init_vr(i); + const auto regptr = init_reg_fixed(i); const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); const auto value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr); @@ -3340,6 +3546,365 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return true; } + static void interp_check(spu_thread* _spu, bool after) + { + static const spu_decoder s_dec; + + static thread_local std::array s_gpr; + + if (!after) + { + // Preserve reg state + s_gpr = _spu->gpr; + + // Execute interpreter instruction + const u32 op = *reinterpret_cast*>(_spu->_ptr(0) + _spu->pc); + if (!s_dec.decode(op)(*_spu, {op})) + LOG_FATAL(SPU, "Bad instruction" HERE); + + // Swap state + for (u32 i = 0; i < s_gpr.size(); ++i) + std::swap(_spu->gpr[i], s_gpr[i]); + } + else + { + // Check saved state + for (u32 i = 0; i < s_gpr.size(); ++i) + { + if (_spu->gpr[i] != s_gpr[i]) + { + LOG_FATAL(SPU, "Register mismatch: $%u\n%s\n%s", i, _spu->gpr[i], s_gpr[i]); + _spu->state += cpu_flag::dbg_pause; + } + } + } + } + + bool compile_interpreter() + { + using namespace llvm; + + // Create LLVM module + std::unique_ptr module = std::make_unique("spu_interpreter.obj", m_context); + module->setTargetTriple(Triple::normalize(sys::getProcessTriple())); + m_module = module.get(); + + // Initialize IR Builder + IRBuilder<> irb(m_context); + m_ir = &irb; + + // Create interpreter table + const auto if_type = get_ftype(); + const auto if_pptr = if_type->getPointerTo()->getPointerTo(); + m_function_table = new GlobalVariable(*m_module, ArrayType::get(if_type->getPointerTo(), 1u << m_interp_magn), true, GlobalValue::InternalLinkage, nullptr); + + // Add return function + const auto ret_func = cast(module->getOrInsertFunction("spu_ret", if_type)); + ret_func->setCallingConv(CallingConv::GHC); + ret_func->setLinkage(GlobalValue::InternalLinkage); + m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", ret_func)); + m_thread = &*(ret_func->arg_begin() + 1); + m_interp_pc = &*(ret_func->arg_begin() + 2); + m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateRetVoid(); + + // Add entry function, serves as a trampoline + const auto main_func = llvm::cast(m_module->getOrInsertFunction("spu_interpreter", get_ftype())); + set_function(main_func); + + // Load pc and opcode + m_interp_pc = m_ir->CreateLoad(spu_ptr(&spu_thread::pc)); + m_interp_op = m_ir->CreateLoad(m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_ir->CreateZExt(m_interp_pc, get_type())), get_type())); + m_interp_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {m_interp_op}); + + // Pinned constant, address of interpreter table + m_interp_table = m_ir->CreateBitCast(m_ir->CreateGEP(m_function_table, {m_ir->getInt64(0), m_ir->getInt64(0)}), get_type()); + + // Pinned constant, mask for shifted register index + m_interp_7f0 = m_ir->getInt32(0x7f0); + + // Pinned constant, address of first register + m_interp_regs = _ptr(m_thread, get_reg_offset(0)); + + // Decode (shift) and load function pointer + const auto first = m_ir->CreateLoad(m_ir->CreateGEP(m_ir->CreateBitCast(m_interp_table, if_pptr), m_ir->CreateLShr(m_interp_op, 32 - m_interp_magn))); + const auto call0 = m_ir->CreateCall(first, {m_lsptr, m_thread, m_interp_pc, m_interp_op, m_interp_table, m_interp_7f0, m_interp_regs}); + call0->setCallingConv(CallingConv::GHC); + m_ir->CreateRetVoid(); + + // Create helper globals + { + std::vector float_to; + std::vector to_float; + float_to.reserve(256); + to_float.reserve(256); + + for (int i = 0; i < 256; ++i) + { + float_to.push_back(ConstantFP::get(get_type(), std::exp2(173 - i))); + to_float.push_back(ConstantFP::get(get_type(), std::exp2(i - 155))); + } + + const auto atype = ArrayType::get(get_type(), 256); + m_scale_float_to = new GlobalVariable(*m_module, atype, true, GlobalValue::InternalLinkage, ConstantArray::get(atype, float_to)); + m_scale_to_float = new GlobalVariable(*m_module, atype, true, GlobalValue::InternalLinkage, ConstantArray::get(atype, to_float)); + } + + // Fill interpreter table + std::vector iptrs; + iptrs.reserve(1u << m_interp_magn); + + m_block = nullptr; + + auto last_itype = spu_itype::UNK; + + for (u32 i = 0; i < 1u << m_interp_magn;) + { + // Fake opcode + const u32 op = i << (32 - m_interp_magn); + + // Instruction type + const auto itype = s_spu_itype.decode(op); + + // Function name + std::string fname = fmt::format("spu_%s", s_spu_iname.decode(op)); + + if (last_itype != itype) + { + // Trigger automatic information collection (probing) + m_op_const_mask = 0; + } + else + { + // Inject const mask into function name + fmt::append(fname, "_%X", (i & (m_op_const_mask >> (32 - m_interp_magn))) | (1u << m_interp_magn)); + } + + // Decode instruction name, access function + const auto f = cast(module->getOrInsertFunction(fname, if_type)); + + // Build if necessary + if (f->empty()) + { + f->setCallingConv(CallingConv::GHC); + f->setLinkage(GlobalValue::InternalLinkage); + + m_function = f; + m_lsptr = &*(f->arg_begin() + 0); + m_thread = &*(f->arg_begin() + 1); + m_interp_pc = &*(f->arg_begin() + 2); + m_interp_op = &*(f->arg_begin() + 3); + m_interp_table = &*(f->arg_begin() + 4); + m_interp_7f0 = &*(f->arg_begin() + 5); + m_interp_regs = &*(f->arg_begin() + 6); + + m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", f)); + + switch (itype) + { + case spu_itype::UNK: + case spu_itype::DFCEQ: + case spu_itype::DFCMEQ: + case spu_itype::DFCGT: + //case spu_itype::DFCMGT: + case spu_itype::DFTSV: + case spu_itype::STOP: + case spu_itype::STOPD: + case spu_itype::RDCH: + case spu_itype::WRCH: + { + // Invalid or abortable instruction. Save current address. + m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + [[fallthrough]]; + } + default: + { + break; + } + } + + try + { + m_interp_bblock = nullptr; + + // Next instruction (no wraparound at the end of LS) + m_interp_pc_next = m_ir->CreateAdd(m_interp_pc, m_ir->getInt32(4)); + + bool check = false; + + if (itype == spu_itype::WRCH || + itype == spu_itype::RDCH || + itype == spu_itype::RCHCNT || + itype == spu_itype::STOP || + itype == spu_itype::STOPD || + itype & spu_itype::floating || + itype & spu_itype::branch) + { + check = false; + } + + if (itype & spu_itype::branch) + { + // Instruction changes pc - change order. + (this->*g_decoder.decode(op))({op}); + + if (m_interp_bblock) + { + m_ir->SetInsertPoint(m_interp_bblock); + m_interp_bblock = nullptr; + } + } + + if (!m_ir->GetInsertBlock()->getTerminator()) + { + if (check) + { + m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + } + + // Decode next instruction. + const auto next_pc = itype & spu_itype::branch ? m_interp_pc : m_interp_pc_next; + const auto be32_op = m_ir->CreateLoad(m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_ir->CreateZExt(next_pc, get_type())), get_type())); + const auto next_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {be32_op}); + const auto next_if = m_ir->CreateLoad(m_ir->CreateGEP(m_ir->CreateBitCast(m_interp_table, if_pptr), m_ir->CreateLShr(next_op, 32 - m_interp_magn))); + llvm::cast(next_if)->setVolatile(true); + + if (!(itype & spu_itype::branch)) + { + if (check) + { + call(&interp_check, m_thread, m_ir->getFalse()); + } + + // Normal instruction. + (this->*g_decoder.decode(op))({op}); + + if (check && !m_ir->GetInsertBlock()->getTerminator()) + { + call(&interp_check, m_thread, m_ir->getTrue()); + } + + m_interp_pc = m_interp_pc_next; + } + + if (!m_ir->GetInsertBlock()->getTerminator()) + { + // Call next instruction. + const auto _stop = BasicBlock::Create(m_context, "", f); + const auto _next = BasicBlock::Create(m_context, "", f); + m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); + m_ir->SetInsertPoint(_next); + + if (itype == spu_itype::WRCH || + itype == spu_itype::RDCH || + itype == spu_itype::RCHCNT || + itype == spu_itype::STOP || + itype == spu_itype::STOPD) + { + m_interp_7f0 = m_ir->getInt32(0x7f0); + m_interp_regs = _ptr(m_thread, get_reg_offset(0)); + } + + const auto ncall = m_ir->CreateCall(next_if, {m_lsptr, m_thread, m_interp_pc, next_op, m_interp_table, m_interp_7f0, m_interp_regs}); + ncall->setCallingConv(CallingConv::GHC); + ncall->setTailCall(); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(_stop); + m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateRetVoid(); + } + } + } + catch (const std::exception& e) + { + std::string dump; + raw_string_ostream out(dump); + out << *module; // print IR + out.flush(); + LOG_ERROR(SPU, "[0x%x] LLVM dump:\n%s", m_pos, dump); + throw; + } + } + + if (last_itype != itype) + { + // Repeat after probing + last_itype = itype; + } + else + { + // Add to the table + iptrs.push_back(f); + i++; + } + } + + m_function_table->setInitializer(ConstantArray::get(ArrayType::get(if_type->getPointerTo(), 1u << m_interp_magn), iptrs)); + m_function_table = nullptr; + + // Initialize pass manager + legacy::FunctionPassManager pm(module.get()); + + // Basic optimizations + pm.add(createEarlyCSEPass()); + pm.add(createCFGSimplificationPass()); + pm.add(createDeadStoreEliminationPass()); + pm.add(createAggressiveDCEPass()); + //pm.add(createLintPass()); + + std::string log; + + raw_string_ostream out(log); + + if (g_cfg.core.spu_debug) + { + fmt::append(log, "LLVM IR (interpreter):\n"); + out << *module; // print IR + out << "\n\n"; + } + + if (verifyModule(*module, &out)) + { + out.flush(); + LOG_ERROR(SPU, "LLVM: Verification failed:\n%s", log); + + if (g_cfg.core.spu_debug) + { + fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); + } + + fmt::raw_error("Compilation failed"); + } + + if (g_cfg.core.spu_debug) + { + // Testing only + m_jit.add(std::move(module), m_spurt->get_cache_path() + "llvm/"); + } + else + { + m_jit.add(std::move(module)); + } + + m_jit.fin(); + + // Register interpreter entry point + spu_runtime::g_interpreter = reinterpret_cast(m_jit.get_engine().getPointerToFunction(main_func)); + + if (!spu_runtime::g_interpreter) + { + return false; + } + + if (g_cfg.core.spu_debug) + { + out.flush(); + fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); + } + + return true; + } + static bool exec_check_state(spu_thread* _spu) { return _spu->check_state(); @@ -3357,6 +3922,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator template void fall(spu_opcode_t op) { + if (m_interp_magn) + { + call(F, m_thread, m_interp_op); + return; + } + update_pc(); call(&exec_fall, m_thread, m_ir->getInt32(op.opcode)); } @@ -3368,6 +3939,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void UNK(spu_opcode_t op_unk) { + if (m_interp_magn) + { + m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + call(&exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); + return; + } + m_block->block_end = m_ir->GetInsertBlock(); update_pc(); tail(&exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); @@ -3380,6 +3958,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void STOP(spu_opcode_t op) // { + if (m_interp_magn) + { + const auto succ = call(&exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff))); + const auto next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(succ, next, stop); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(next); + return; + } + update_pc(); const auto succ = call(&exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); @@ -3403,6 +3993,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void STOPD(spu_opcode_t op) // { + if (m_interp_magn) + { + const auto succ = call(&exec_stop, m_thread, m_ir->getInt32(0x3fff)); + const auto next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(succ, next, stop); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(next); + return; + } + STOP(spu_opcode_t{0x3fff}); } @@ -3478,6 +4080,20 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { value_t res; + if (m_interp_magn) + { + res.value = call(&exec_rdch, m_thread, get_imm(op.ra).value); + const auto next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(next); + res.value = m_ir->CreateTrunc(res.value, get_type()); + set_vr(op.rt, insert(splat(0), 3, res)); + return; + } + switch (op.ra) { case SPU_RdSRR0: @@ -3596,6 +4212,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { value_t res; + if (m_interp_magn) + { + res.value = call(&exec_rchcnt, m_thread, get_imm(op.ra).value); + set_vr(op.rt, insert(splat(0), 3, res)); + return; + } + switch (op.ra) { case SPU_WrOutMbox: @@ -3703,6 +4326,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { const auto val = extract(get_vr(op.rt), 3); + if (m_interp_magn) + { + const auto succ = call(&exec_wrch, m_thread, get_imm(op.ra).value, val.value); + const auto next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(succ, next, stop); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(next); + return; + } + switch (op.ra) { case SPU_WrSRR0: @@ -3766,7 +4401,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } case MFC_LSA: { - set_vr(s_reg_mfc_lsa, val); + set_reg_fixed(s_reg_mfc_lsa, val.value); return; } case MFC_EAH: @@ -3785,17 +4420,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } case MFC_EAL: { - set_vr(s_reg_mfc_eal, val); + set_reg_fixed(s_reg_mfc_eal, val.value); return; } case MFC_Size: { - set_vr(s_reg_mfc_size, trunc(val & 0x7fff)); + set_reg_fixed(s_reg_mfc_size, trunc(val & 0x7fff).value); return; } case MFC_TagID: { - set_vr(s_reg_mfc_tag, trunc(val & 0x1f)); + set_reg_fixed(s_reg_mfc_tag, trunc(val & 0x1f).value); return; } case MFC_Cmd: @@ -3814,11 +4449,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (auto ci = llvm::dyn_cast(trunc(val).value)) { - const auto eal = get_vr(s_reg_mfc_eal); - const auto lsa = get_vr(s_reg_mfc_lsa); - const auto tag = get_vr(s_reg_mfc_tag); + const auto eal = get_reg_fixed(s_reg_mfc_eal); + const auto lsa = get_reg_fixed(s_reg_mfc_lsa); + const auto tag = get_reg_fixed(s_reg_mfc_tag); - const auto size = get_vr(s_reg_mfc_size); + const auto size = get_reg_fixed(s_reg_mfc_size); const auto mask = m_ir->CreateShl(m_ir->getInt32(1), zext(tag).value); const auto exec = llvm::BasicBlock::Create(m_context, "", m_function); const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); @@ -4131,7 +4766,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // This instruction must be used following a store instruction that modifies the instruction stream. m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); - if (g_cfg.core.spu_block_size == spu_block_size_type::safe) + if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !m_interp_magn) { m_block->block_end = m_ir->GetInsertBlock(); m_ir->CreateStore(m_ir->getInt32(m_pos + 4), spu_ptr(&spu_thread::pc)); @@ -4156,12 +4791,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Check SPUInterpreter for notes. } - void SF(spu_opcode_t op) // + void SF(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra)); } - void OR(spu_opcode_t op) // + void OR(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) | get_vr(op.rb)); } @@ -4173,12 +4808,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, zext(a <= b)); } - void SFH(spu_opcode_t op) // + void SFH(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra)); } - void NOR(spu_opcode_t op) // + void NOR(spu_opcode_t op) { set_vr(op.rt, ~(get_vr(op.ra) | get_vr(op.rb))); } @@ -4190,118 +4825,139 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, max(a, b) - min(a, b)); } + template + void make_spu_rol(spu_opcode_t op, value_t by) + { + set_vr(op.rt, rol(get_vr(op.ra), by)); + } + + template + void make_spu_rotate_mask(spu_opcode_t op, value_t by) + { + value_t sh; + sh.value = m_ir->CreateAnd(m_ir->CreateNeg(by.value), by.esize * 2 - 1); + if constexpr (sh.esize != by.esize) + sh.value = m_ir->CreateZExtOrTrunc(sh.value, m_ir->getIntNTy(sh.esize)); + if constexpr (!by.is_vector) + sh.value = m_ir->CreateVectorSplat(sh.is_vector, sh.value); + + set_vr(op.rt, select(sh < by.esize, eval(get_vr(op.ra) >> sh), splat(0))); + } + + template + void make_spu_rotate_sext(spu_opcode_t op, value_t by) + { + value_t sh; + sh.value = m_ir->CreateAnd(m_ir->CreateNeg(by.value), by.esize * 2 - 1); + if constexpr (sh.esize != by.esize) + sh.value = m_ir->CreateZExtOrTrunc(sh.value, m_ir->getIntNTy(sh.esize)); + if constexpr (!by.is_vector) + sh.value = m_ir->CreateVectorSplat(sh.is_vector, sh.value); + + value_t max_sh = splat(by.esize - 1); + sh.value = m_ir->CreateSelect(m_ir->CreateICmpUGT(max_sh.value, sh.value), sh.value, max_sh.value); + set_vr(op.rt, get_vr(op.ra) >> sh); + } + + template + void make_spu_shift_left(spu_opcode_t op, value_t by) + { + value_t sh; + sh.value = m_ir->CreateAnd(by.value, by.esize * 2 - 1); + if constexpr (sh.esize != by.esize) + sh.value = m_ir->CreateZExtOrTrunc(sh.value, m_ir->getIntNTy(sh.esize)); + if constexpr (!by.is_vector) + sh.value = m_ir->CreateVectorSplat(sh.is_vector, sh.value); + + set_vr(op.rt, select(sh < by.esize, eval(get_vr(op.ra) << sh), splat(0))); + } + void ROT(spu_opcode_t op) { - set_vr(op.rt, rol(get_vr(op.ra), get_vr(op.rb))); + make_spu_rol(op, get_vr(op.rb)); } void ROTM(spu_opcode_t op) { - const auto sh = eval(-get_vr(op.rb) & 0x3f); - set_vr(op.rt, select(sh < 0x20, eval(get_vr(op.ra) >> sh), splat(0))); + make_spu_rotate_mask(op, get_vr(op.rb)); } void ROTMA(spu_opcode_t op) { - const auto sh = eval(-get_vr(op.rb) & 0x3f); - set_vr(op.rt, get_vr(op.ra) >> bitcast(min(sh, splat(0x1f)))); + make_spu_rotate_sext(op, get_vr(op.rb)); } void SHL(spu_opcode_t op) { - const auto sh = eval(get_vr(op.rb) & 0x3f); - set_vr(op.rt, select(sh < 0x20, eval(get_vr(op.ra) << sh), splat(0))); + make_spu_shift_left(op, get_vr(op.rb)); } void ROTH(spu_opcode_t op) { - set_vr(op.rt, rol(get_vr(op.ra), get_vr(op.rb))); + make_spu_rol(op, get_vr(op.rb)); } void ROTHM(spu_opcode_t op) { - const auto sh = eval(-get_vr(op.rb) & 0x1f); - set_vr(op.rt, select(sh < 0x10, eval(get_vr(op.ra) >> sh), splat(0))); + make_spu_rotate_mask(op, get_vr(op.rb)); } void ROTMAH(spu_opcode_t op) { - const auto sh = eval(-get_vr(op.rb) & 0x1f); - set_vr(op.rt, get_vr(op.ra) >> bitcast(min(sh, splat(0xf)))); + make_spu_rotate_sext(op, get_vr(op.rb)); } void SHLH(spu_opcode_t op) { - const auto sh = eval(get_vr(op.rb) & 0x1f); - set_vr(op.rt, select(sh < 0x10, eval(get_vr(op.ra) << sh), splat(0))); + make_spu_shift_left(op, get_vr(op.rb)); } void ROTI(spu_opcode_t op) { - set_vr(op.rt, rol(get_vr(op.ra), op.i7)); + make_spu_rol(op, get_imm(op.i7, false)); } void ROTMI(spu_opcode_t op) { - if (-op.i7 & 0x20) - { - return set_vr(op.rt, splat(0)); - } - - set_vr(op.rt, get_vr(op.ra) >> (-op.i7 & 0x1f)); + make_spu_rotate_mask(op, get_imm(op.i7, false)); } void ROTMAI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) >> (-op.i7 & 0x20 ? 0x1f : -op.i7 & 0x1f)); + make_spu_rotate_sext(op, get_imm(op.i7, false)); } void SHLI(spu_opcode_t op) { - if (op.i7 & 0x20) - { - return set_vr(op.rt, splat(0)); - } - - set_vr(op.rt, get_vr(op.ra) << (op.i7 & 0x1f)); + make_spu_shift_left(op, get_imm(op.i7, false)); } void ROTHI(spu_opcode_t op) { - set_vr(op.rt, rol(get_vr(op.ra), op.i7)); + make_spu_rol(op, get_imm(op.i7, false)); } void ROTHMI(spu_opcode_t op) { - if (-op.i7 & 0x10) - { - return set_vr(op.rt, splat(0)); - } - - set_vr(op.rt, get_vr(op.ra) >> (-op.i7 & 0xf)); + make_spu_rotate_mask(op, get_imm(op.i7, false)); } void ROTMAHI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) >> (-op.i7 & 0x10 ? 0xf : -op.i7 & 0xf)); + make_spu_rotate_sext(op, get_imm(op.i7, false)); } void SHLHI(spu_opcode_t op) { - if (op.i7 & 0x10) - { - return set_vr(op.rt, splat(0)); - } - - set_vr(op.rt, get_vr(op.ra) << (op.i7 & 0xf)); + make_spu_shift_left(op, get_imm(op.i7, false)); } - void A(spu_opcode_t op) // + void A(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); } - void AND(spu_opcode_t op) // + void AND(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) & get_vr(op.rb)); } @@ -4313,12 +4969,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, zext(a + b < a)); } - void AH(spu_opcode_t op) // + void AH(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); } - void NAND(spu_opcode_t op) // + void NAND(spu_opcode_t op) { set_vr(op.rt, ~(get_vr(op.ra) & get_vr(op.rb))); } @@ -4542,7 +5198,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void CBD(spu_opcode_t op) { - const auto a = eval(extract(get_vr(op.ra), 3) + op.i7); + const auto a = eval(extract(get_vr(op.ra), 3) + get_imm(op.i7)); const auto i = eval(~a & 0xf); auto r = build(0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10); r.value = m_ir->CreateInsertElement(r.value, m_ir->getInt8(0x3), i.value); @@ -4551,7 +5207,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void CHD(spu_opcode_t op) { - const auto a = eval(extract(get_vr(op.ra), 3) + op.i7); + const auto a = eval(extract(get_vr(op.ra), 3) + get_imm(op.i7)); const auto i = eval(~a >> 1 & 0x7); auto r = build(0x1e1f, 0x1c1d, 0x1a1b, 0x1819, 0x1617, 0x1415, 0x1213, 0x1011); r.value = m_ir->CreateInsertElement(r.value, m_ir->getInt16(0x0203), i.value); @@ -4560,7 +5216,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void CWD(spu_opcode_t op) { - const auto a = eval(extract(get_vr(op.ra), 3) + op.i7); + const auto a = eval(extract(get_vr(op.ra), 3) + get_imm(op.i7)); const auto i = eval(~a >> 2 & 0x3); auto r = build(0x1c1d1e1f, 0x18191a1b, 0x14151617, 0x10111213); r.value = m_ir->CreateInsertElement(r.value, m_ir->getInt32(0x010203), i.value); @@ -4569,7 +5225,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void CDD(spu_opcode_t op) { - const auto a = eval(extract(get_vr(op.ra), 3) + op.i7); + const auto a = eval(extract(get_vr(op.ra), 3) + get_imm(op.i7)); const auto i = eval(~a >> 3 & 0x1); auto r = build(0x18191a1b1c1d1e1f, 0x1011121314151617); r.value = m_ir->CreateInsertElement(r.value, m_ir->getInt64(0x01020304050607), i.value); @@ -4578,85 +5234,47 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void ROTQBII(spu_opcode_t op) { - const auto s = op.i7 & 0x7; - const auto a = get_vr(op.ra); - - if (s == 0) - { - return set_vr(op.rt, a); - } - - set_vr(op.rt, a << s | zshuffle(a, 3, 0, 1, 2) >> (32 - s)); + const auto a = get_vr(op.ra); + const auto b = eval(get_imm(op.i7, false) & 0x7); + set_vr(op.rt, a << b | zshuffle(a, 1, 0) >> 56 >> (8 - b)); } void ROTQMBII(spu_opcode_t op) { - const auto s = -op.i7 & 0x7; - const auto a = get_vr(op.ra); - - if (s == 0) - { - return set_vr(op.rt, a); - } - - set_vr(op.rt, a >> s | zshuffle(a, 1, 2, 3, 4) << (32 - s)); + const auto a = get_vr(op.ra); + const auto b = eval(-get_imm(op.i7, false) & 0x7); + set_vr(op.rt, a >> b | zshuffle(a, 1, 2) << 56 << (8 - b)); } void SHLQBII(spu_opcode_t op) { - const auto s = op.i7 & 0x7; - const auto a = get_vr(op.ra); - - if (s == 0) - { - return set_vr(op.rt, a); - } - - set_vr(op.rt, a << s | zshuffle(a, 4, 0, 1, 2) >> (32 - s)); + const auto a = get_vr(op.ra); + const auto b = eval(get_imm(op.i7, false) & 0x7); + set_vr(op.rt, a << b | zshuffle(a, 2, 0) >> 56 >> (8 - b)); } void ROTQBYI(spu_opcode_t op) { - const u32 s = -op.i7 & 0xf; - set_vr(op.rt, zshuffle(get_vr(op.ra), - s & 15, (s + 1) & 15, (s + 2) & 15, (s + 3) & 15, - (s + 4) & 15, (s + 5) & 15, (s + 6) & 15, (s + 7) & 15, - (s + 8) & 15, (s + 9) & 15, (s + 10) & 15, (s + 11) & 15, - (s + 12) & 15, (s + 13) & 15, (s + 14) & 15, (s + 15) & 15)); + const auto a = get_vr(op.ra); + auto sh = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + sh = eval((sh - get_imm(op.i7, false)) & 0xf); + set_vr(op.rt, pshufb(a, sh)); } void ROTQMBYI(spu_opcode_t op) { - const u32 s = -op.i7 & 0x1f; - - if (s >= 16) - { - return set_vr(op.rt, splat(0)); - } - - set_vr(op.rt, zshuffle(get_vr(op.ra), - s, s + 1, s + 2, s + 3, - s + 4, s + 5, s + 6, s + 7, - s + 8, s + 9, s + 10, s + 11, - s + 12, s + 13, s + 14, s + 15)); + const auto a = get_vr(op.ra); + auto sh = build(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); + sh = eval(sh + (-get_imm(op.i7, false) & 0x1f)); + set_vr(op.rt, pshufb(a, sh)); } void SHLQBYI(spu_opcode_t op) { - const u32 s = op.i7 & 0x1f; - - if (s >= 16) - { - return set_vr(op.rt, splat(0)); - } - - const u32 x = -s; - - set_vr(op.rt, zshuffle(get_vr(op.ra), - x & 31, (x + 1) & 31, (x + 2) & 31, (x + 3) & 31, - (x + 4) & 31, (x + 5) & 31, (x + 6) & 31, (x + 7) & 31, - (x + 8) & 31, (x + 9) & 31, (x + 10) & 31, (x + 11) & 31, - (x + 12) & 31, (x + 13) & 31, (x + 14) & 31, (x + 15) & 31)); + const auto a = get_vr(op.ra); + auto sh = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + sh = eval(sh - (get_imm(op.i7, false) & 0x1f)); + set_vr(op.rt, pshufb(a, sh)); } void CGT(spu_opcode_t op) @@ -4664,7 +5282,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } - void XOR(spu_opcode_t op) // + void XOR(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) ^ get_vr(op.rb)); } @@ -4674,7 +5292,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } - void EQV(spu_opcode_t op) // + void EQV(spu_opcode_t op) { set_vr(op.rt, ~(get_vr(op.ra) ^ get_vr(op.rb))); } @@ -4725,7 +5343,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } - void ANDC(spu_opcode_t op) // + void ANDC(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) & ~get_vr(op.rb)); } @@ -4735,7 +5353,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } - void ORC(spu_opcode_t op) // + void ORC(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) | ~get_vr(op.rb)); } @@ -4829,6 +5447,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void FSMBI(spu_opcode_t op) { + if (m_interp_magn) + { + const auto m = bitcast(get_imm(op.i16)); + set_vr(op.rt, sext(m)); + return; + } + v128 data; for (u32 i = 0; i < 16; i++) data._bytes[i] = op.i16 & (1u << i) ? -1 : 0; @@ -4839,160 +5464,160 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void IL(spu_opcode_t op) { - set_vr(op.rt, splat(op.si16)); + set_vr(op.rt, get_imm(op.si16)); } void ILHU(spu_opcode_t op) { - set_vr(op.rt, splat(op.i16 << 16)); + set_vr(op.rt, get_imm(op.i16) << 16); } void ILH(spu_opcode_t op) { - set_vr(op.rt, splat(op.i16)); + set_vr(op.rt, get_imm(op.i16)); } void IOHL(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.rt) | op.i16); + set_vr(op.rt, get_vr(op.rt) | get_imm(op.i16)); } void ORI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) | op.si10); + set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); } void ORHI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) | op.si10); + set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); } void ORBI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) | op.si10); + set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); } void SFI(spu_opcode_t op) { - set_vr(op.rt, op.si10 - get_vr(op.ra)); + set_vr(op.rt, get_imm(op.si10) - get_vr(op.ra)); } void SFHI(spu_opcode_t op) { - set_vr(op.rt, op.si10 - get_vr(op.ra)); + set_vr(op.rt, get_imm(op.si10) - get_vr(op.ra)); } void ANDI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) & op.si10); + set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); } void ANDHI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) & op.si10); + set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); } void ANDBI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) & op.si10); + set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); } void AI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) + op.si10); + set_vr(op.rt, get_vr(op.ra) + get_imm(op.si10)); } void AHI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) + op.si10); + set_vr(op.rt, get_vr(op.ra) + get_imm(op.si10)); } void XORI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) ^ op.si10); + set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); } void XORHI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) ^ op.si10); + set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); } void XORBI(spu_opcode_t op) { - set_vr(op.rt, get_vr(op.ra) ^ op.si10); + set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); } void CGTI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) > op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CGTHI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) > op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CGTBI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) > op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CLGTI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) > op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CLGTHI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) > op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CLGTBI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) > op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void MPYI(spu_opcode_t op) { - set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * splat(op.si10)); + set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * get_imm(op.si10)); } void MPYUI(spu_opcode_t op) { - set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * splat(op.si10 & 0xffff)); + set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_imm(op.si10) & 0xffff)); } void CEQI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) == op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } void CEQHI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) == op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } void CEQBI(spu_opcode_t op) { - set_vr(op.rt, sext(get_vr(op.ra) == op.si10)); + set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } void ILA(spu_opcode_t op) { - set_vr(op.rt, splat(op.i18)); + set_vr(op.rt, get_imm(op.i18)); } void SELB(spu_opcode_t op) { - if (auto ei = llvm::dyn_cast_or_null(m_block->reg[op.rc])) + if (auto ei = llvm::dyn_cast_or_null(get_reg_raw(op.rc))) { // Detect if the mask comes from a comparison instruction if (ei->getOpcode() == llvm::Instruction::SExt && ei->getSrcTy()->isIntOrIntVectorTy(1)) { auto op0 = ei->getOperand(0); auto typ = ei->getDestTy(); - auto op1 = m_block->reg[op.rb]; - auto op2 = m_block->reg[op.ra]; + auto op1 = get_reg_raw(op.rb); + auto op2 = get_reg_raw(op.ra); if (typ == get_type()) { @@ -5043,14 +5668,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (op0 && op1 && op2) { - set_vr(op.rt4, m_ir->CreateSelect(op0, op1, op2)); + set_reg_fixed(op.rt4, m_ir->CreateSelect(op0, op1, op2)); return; } } } - const auto op1 = m_block->reg[op.rb]; - const auto op2 = m_block->reg[op.ra]; + const auto op1 = get_reg_raw(op.rb); + const auto op2 = get_reg_raw(op.ra); if (op1 && op1->getType() == get_type() || op2 && op2->getType() == get_type()) { @@ -5061,16 +5686,16 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto m = conv_xfloat_mask(c.value); const auto x = m_ir->CreateAnd(double_as_uint64(b.value), m); const auto y = m_ir->CreateAnd(double_as_uint64(a.value), m_ir->CreateNot(m)); - set_vr(op.rt4, uint64_as_double(m_ir->CreateOr(x, y))); + set_reg_fixed(op.rt4, uint64_as_double(m_ir->CreateOr(x, y))); return; } set_vr(op.rt4, merge(get_vr(op.rc), get_vr(op.rb), get_vr(op.ra))); } - void SHUFB(spu_opcode_t op) + void SHUFB(spu_opcode_t op) // { - if (auto ii = llvm::dyn_cast_or_null(m_block->reg[op.rc])) + if (auto ii = llvm::dyn_cast_or_null(get_reg_raw(op.rc))) { // Detect if the mask comes from a CWD-like constant generation instruction auto c0 = llvm::dyn_cast(ii->getOperand(0)); @@ -5109,7 +5734,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (vtype && _new) { - set_vr(op.rt4, m_ir->CreateInsertElement(get_vr(op.rb, vtype), _new, ii->getOperand(2))); + set_reg_fixed(op.rt4, m_ir->CreateInsertElement(get_reg_fixed(op.rb, vtype), _new, ii->getOperand(2))); return; } } @@ -5171,10 +5796,10 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (mask._u64[0] == cm.i0 && mask._u64[1] == cm.i1) { const auto t = (this->*cm.type)(); - const auto a = get_vr(op.ra, t); - const auto b = get_vr(op.rb, t); + const auto a = get_reg_fixed(op.ra, t); + const auto b = get_reg_fixed(op.rb, t); const auto e = m_ir->CreateExtractElement(a, cm.extract_from); - set_vr(op.rt4, m_ir->CreateInsertElement(b, e, cm.insert_to)); + set_reg_fixed(op.rt4, m_ir->CreateInsertElement(b, e, cm.insert_to)); return; } } @@ -5190,7 +5815,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); const auto c = make_const_vector(mask, get_type()); - set_vr(op.rt4, m_ir->CreateShuffleVector(b.value, op.ra == op.rb ? b.value : a.value, m_ir->CreateZExt(c, get_type()))); + set_reg_fixed(op.rt4, m_ir->CreateShuffleVector(b.value, op.ra == op.rb ? b.value : a.value, m_ir->CreateZExt(c, get_type()))); return; } @@ -5245,42 +5870,42 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return UNK(op); } - void DFA(spu_opcode_t op) // + void DFA(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); } - void DFS(spu_opcode_t op) // + void DFS(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); } - void DFM(spu_opcode_t op) // + void DFM(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); } - void DFMA(spu_opcode_t op) // + void DFMA(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb) + get_vr(op.rt)); } - void DFMS(spu_opcode_t op) // + void DFMS(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb) - get_vr(op.rt)); } - void DFNMS(spu_opcode_t op) // + void DFNMS(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rt) - get_vr(op.ra) * get_vr(op.rb)); } - void DFNMA(spu_opcode_t op) // + void DFNMA(spu_opcode_t op) { set_vr(op.rt, -(get_vr(op.ra) * get_vr(op.rb) + get_vr(op.rt))); } - void FREST(spu_opcode_t op) // + void FREST(spu_opcode_t op) { // TODO if (g_cfg.core.spu_accurate_xfloat) @@ -5289,7 +5914,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, fsplat(1.0) / get_vr(op.ra)); } - void FRSQEST(spu_opcode_t op) // + void FRSQEST(spu_opcode_t op) { // TODO if (g_cfg.core.spu_accurate_xfloat) @@ -5298,7 +5923,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, fsplat(1.0) / sqrt(fabs(get_vr(op.ra)))); } - void FCGT(spu_opcode_t op) // + void FCGT(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { @@ -5327,7 +5952,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - void FCMGT(spu_opcode_t op) // + void FCMGT(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { @@ -5355,7 +5980,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - void FA(spu_opcode_t op) // + void FA(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); @@ -5363,7 +5988,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); } - void FS(spu_opcode_t op) // + void FS(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); @@ -5371,7 +5996,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); } - void FM(spu_opcode_t op) // + void FM(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); @@ -5399,7 +6024,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); } - void FESD(spu_opcode_t op) // + void FESD(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { @@ -5419,7 +6044,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - void FRDS(spu_opcode_t op) // + void FRDS(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { @@ -5440,7 +6065,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - void FCEQ(spu_opcode_t op) // + void FCEQ(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt, sext(fcmp(get_vr(op.ra), get_vr(op.rb)))); @@ -5448,7 +6073,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, sext(fcmp(get_vr(op.ra), get_vr(op.rb)))); } - void FCMEQ(spu_opcode_t op) // + void FCMEQ(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt, sext(fcmp(fabs(get_vr(op.ra)), fabs(get_vr(op.rb))))); @@ -5465,7 +6090,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return select(eval(max(aa, ab) > 0x7f7fffff), fsplat(0.), eval(a * b)); } - void FNMS(spu_opcode_t op) // + void FNMS(spu_opcode_t op) { // See FMA. if (g_cfg.core.spu_accurate_xfloat) @@ -5476,7 +6101,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt4, get_vr(op.rc) - get_vr(op.ra) * get_vr(op.rb)); } - void FMA(spu_opcode_t op) // + void FMA(spu_opcode_t op) { // Hardware FMA produces the same result as multiple + add on the limited double range (xfloat). if (g_cfg.core.spu_accurate_xfloat) @@ -5487,7 +6112,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt4, get_vr(op.ra) * get_vr(op.rb) + get_vr(op.rc)); } - void FMS(spu_opcode_t op) // + void FMS(spu_opcode_t op) { // See FMA. if (g_cfg.core.spu_accurate_xfloat) @@ -5498,7 +6123,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt4, get_vr(op.ra) * get_vr(op.rb) - get_vr(op.rc)); } - void FI(spu_opcode_t op) // + void FI(spu_opcode_t op) { // TODO if (g_cfg.core.spu_accurate_xfloat) @@ -5507,13 +6132,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, get_vr(op.rb)); } - void CFLTS(spu_opcode_t op) // + void CFLTS(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { value_t a = get_vr(op.ra); - if (op.i8 != 173) - a = eval(a * fsplat(std::exp2(static_cast(173 - op.i8)))); + value_t s; + if (m_interp_magn) + s = vsplat(bitcast(((1023 + 173) - get_imm(op.i8)) << 52)); + else + s = fsplat(std::exp2(static_cast(173 - op.i8))); + if (op.i8 != 173 || m_interp_magn) + a = eval(a * s); value_t r; @@ -5562,8 +6192,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator else { value_t a = get_vr(op.ra); - if (op.i8 != 173) - a = eval(a * fsplat(std::exp2(static_cast(static_cast(173 - op.i8))))); + value_t s; + if (m_interp_magn) + s = vsplat(load_const(m_scale_float_to, get_imm(op.i8))); + else + s = fsplat(std::exp2(static_cast(static_cast(173 - op.i8)))); + if (op.i8 != 173 || m_interp_magn) + a = eval(a * s); value_t r; r.value = m_ir->CreateFPToSI(a.value, get_type()); @@ -5571,13 +6206,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - void CFLTU(spu_opcode_t op) // + void CFLTU(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { value_t a = get_vr(op.ra); - if (op.i8 != 173) - a = eval(a * fsplat(std::exp2(static_cast(173 - op.i8)))); + value_t s; + if (m_interp_magn) + s = vsplat(bitcast(((1023 + 173) - get_imm(op.i8)) << 52)); + else + s = fsplat(std::exp2(static_cast(173 - op.i8))); + if (op.i8 != 173 || m_interp_magn) + a = eval(a * s); value_t r; @@ -5626,8 +6266,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator else { value_t a = get_vr(op.ra); - if (op.i8 != 173) - a = eval(a * fsplat(std::exp2(static_cast(static_cast(173 - op.i8))))); + value_t s; + if (m_interp_magn) + s = vsplat(load_const(m_scale_float_to, get_imm(op.i8))); + else + s = fsplat(std::exp2(static_cast(static_cast(173 - op.i8)))); + if (op.i8 != 173 || m_interp_magn) + a = eval(a * s); value_t r; r.value = m_ir->CreateFPToUI(a.value, get_type()); @@ -5635,7 +6280,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - void CSFLT(spu_opcode_t op) // + void CSFLT(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { @@ -5652,21 +6297,31 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator r.value = m_ir->CreateSIToFP(a.value, get_type()); } - if (op.i8 != 155) - r = eval(r * fsplat(std::exp2(static_cast(op.i8 - 155)))); + value_t s; + if (m_interp_magn) + s = vsplat(bitcast((get_imm(op.i8) + (1023 - 155)) << 52)); + else + s = fsplat(std::exp2(static_cast(op.i8 - 155))); + if (op.i8 != 155 || m_interp_magn) + r = eval(r * s); set_vr(op.rt, r); } else { value_t r; r.value = m_ir->CreateSIToFP(get_vr(op.ra).value, get_type()); - if (op.i8 != 155) - r = eval(r * fsplat(std::exp2(static_cast(static_cast(op.i8 - 155))))); + value_t s; + if (m_interp_magn) + s = vsplat(load_const(m_scale_to_float, get_imm(op.i8))); + else + s = fsplat(std::exp2(static_cast(static_cast(op.i8 - 155)))); + if (op.i8 != 155 || m_interp_magn) + r = eval(r * s); set_vr(op.rt, r); } } - void CUFLT(spu_opcode_t op) // + void CUFLT(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { @@ -5683,21 +6338,31 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator r.value = m_ir->CreateUIToFP(a.value, get_type()); } - if (op.i8 != 155) - r = eval(r * fsplat(std::exp2(static_cast(op.i8 - 155)))); + value_t s; + if (m_interp_magn) + s = vsplat(bitcast((get_imm(op.i8) + (1023 - 155)) << 52)); + else + s = fsplat(std::exp2(static_cast(op.i8 - 155))); + if (op.i8 != 155 || m_interp_magn) + r = eval(r * s); set_vr(op.rt, r); } else { value_t r; r.value = m_ir->CreateUIToFP(get_vr(op.ra).value, get_type()); - if (op.i8 != 155) - r = eval(r * fsplat(std::exp2(static_cast(static_cast(op.i8 - 155))))); + value_t s; + if (m_interp_magn) + s = vsplat(load_const(m_scale_to_float, get_imm(op.i8))); + else + s = fsplat(std::exp2(static_cast(static_cast(op.i8 - 155)))); + if (op.i8 != 155 || m_interp_magn) + r = eval(r * s); set_vr(op.rt, r); } } - void STQX(spu_opcode_t op) // + void STQX(spu_opcode_t op) { value_t addr = zext((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0); value_t r = get_vr(op.rt); @@ -5705,7 +6370,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->CreateStore(r.value, m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, addr.value), get_type())); } - void LQX(spu_opcode_t op) // + void LQX(spu_opcode_t op) { value_t addr = zext((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0); value_t r; @@ -5714,17 +6379,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator set_vr(op.rt, r); } - void STQA(spu_opcode_t op) // + void STQA(spu_opcode_t op) { - value_t addr = splat(spu_ls_target(0, op.i16)); + value_t addr = eval((get_imm(op.i16, false) << 2) & 0x3fff0); value_t r = get_vr(op.rt); r.value = m_ir->CreateShuffleVector(r.value, r.value, {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); m_ir->CreateStore(r.value, m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, addr.value), get_type())); } - void LQA(spu_opcode_t op) // + void LQA(spu_opcode_t op) { - value_t addr = splat(spu_ls_target(0, op.i16)); + value_t addr = eval((get_imm(op.i16, false) << 2) & 0x3fff0); value_t r; r.value = m_ir->CreateLoad(m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, addr.value), get_type())); r.value = m_ir->CreateShuffleVector(r.value, r.value, {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); @@ -5733,7 +6398,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void STQR(spu_opcode_t op) // { - value_t addr = splat(spu_ls_target(m_pos, op.i16)); + value_t addr; + addr.value = m_interp_magn ? m_interp_pc : m_ir->getInt32(m_pos); + addr = eval(((get_imm(op.i16, false) << 2) + zext(addr)) & 0x3fff0); value_t r = get_vr(op.rt); r.value = m_ir->CreateShuffleVector(r.value, r.value, {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); m_ir->CreateStore(r.value, m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, addr.value), get_type())); @@ -5741,24 +6408,26 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void LQR(spu_opcode_t op) // { - value_t addr = splat(spu_ls_target(m_pos, op.i16)); + value_t addr; + addr.value = m_interp_magn ? m_interp_pc : m_ir->getInt32(m_pos); + addr = eval(((get_imm(op.i16, false) << 2) + zext(addr)) & 0x3fff0); value_t r; r.value = m_ir->CreateLoad(m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, addr.value), get_type())); r.value = m_ir->CreateShuffleVector(r.value, r.value, {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); set_vr(op.rt, r); } - void STQD(spu_opcode_t op) // + void STQD(spu_opcode_t op) { - value_t addr = zext((extract(get_vr(op.ra), 3) + (op.si10 << 4)) & 0x3fff0); + value_t addr = zext((extract(get_vr(op.ra), 3) + (get_imm(op.si10) << 4)) & 0x3fff0); value_t r = get_vr(op.rt); r.value = m_ir->CreateShuffleVector(r.value, r.value, {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); m_ir->CreateStore(r.value, m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, addr.value), get_type())); } - void LQD(spu_opcode_t op) // + void LQD(spu_opcode_t op) { - value_t addr = zext((extract(get_vr(op.ra), 3) + (op.si10 << 4)) & 0x3fff0); + value_t addr = zext((extract(get_vr(op.ra), 3) + (get_imm(op.si10) << 4)) & 0x3fff0); value_t r; r.value = m_ir->CreateLoad(m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, addr.value), get_type())); r.value = m_ir->CreateShuffleVector(r.value, r.value, {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); @@ -5771,6 +6440,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(cond.value, halt, next, m_md_unlikely); m_ir->SetInsertPoint(halt); + if (m_interp_magn) + m_ir->CreateStore(&*(m_function->arg_begin() + 2), spu_ptr(&spu_thread::pc))->setVolatile(true); const auto pstatus = spu_ptr(&spu_thread::status); const auto chalt = m_ir->getInt32(SPU_STATUS_STOPPED_BY_HALT); m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, pstatus, chalt, llvm::AtomicOrdering::Release)->setVolatile(true); @@ -5780,39 +6451,39 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(next); } - void HGT(spu_opcode_t op) // + void HGT(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); make_halt(cond); } - void HEQ(spu_opcode_t op) // + void HEQ(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) == extract(get_vr(op.rb), 3)); make_halt(cond); } - void HLGT(spu_opcode_t op) // + void HLGT(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); make_halt(cond); } - void HGTI(spu_opcode_t op) // + void HGTI(spu_opcode_t op) { - const auto cond = eval(extract(get_vr(op.ra), 3) > op.si10); + const auto cond = eval(extract(get_vr(op.ra), 3) > get_imm(op.si10)); make_halt(cond); } - void HEQI(spu_opcode_t op) // + void HEQI(spu_opcode_t op) { - const auto cond = eval(extract(get_vr(op.ra), 3) == op.si10); + const auto cond = eval(extract(get_vr(op.ra), 3) == get_imm(op.si10)); make_halt(cond); } - void HLGTI(spu_opcode_t op) // + void HLGTI(spu_opcode_t op) { - const auto cond = eval(extract(get_vr(op.ra), 3) > op.si10); + const auto cond = eval(extract(get_vr(op.ra), 3) > get_imm(op.si10)); make_halt(cond); } @@ -5857,6 +6528,36 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator llvm::BasicBlock* add_block_indirect(spu_opcode_t op, value_t addr, bool ret = true) { + if (m_interp_magn) + { + m_interp_bblock = llvm::BasicBlock::Create(m_context, "", m_function); + + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + const auto e_exec = llvm::BasicBlock::Create(m_context, "", m_function); + const auto d_test = llvm::BasicBlock::Create(m_context, "", m_function); + const auto d_exec = llvm::BasicBlock::Create(m_context, "", m_function); + const auto d_done = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + m_ir->CreateCondBr(get_imm(op.e).value, e_exec, d_test, m_md_unlikely); + m_ir->SetInsertPoint(e_exec); + const auto e_addr = call(&exec_check_interrupts, m_thread, addr.value); + m_ir->CreateBr(d_test); + m_ir->SetInsertPoint(d_test); + const auto target = m_ir->CreatePHI(get_type(), 2); + target->addIncoming(addr.value, result); + target->addIncoming(e_addr, e_exec); + m_ir->CreateCondBr(get_imm(op.d).value, d_exec, d_done, m_md_unlikely); + m_ir->SetInsertPoint(d_exec); + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled))->setVolatile(true); + m_ir->CreateBr(d_done); + m_ir->SetInsertPoint(d_done); + m_ir->CreateBr(m_interp_bblock); + m_ir->SetInsertPoint(cblock); + m_interp_pc = target; + return result; + } + // Convert an indirect branch into a static one if possible if (const auto _int = llvm::dyn_cast(addr.value)) { @@ -5891,7 +6592,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) { - sp = eval(extract(get_vr(1), 3) & 0x3fff0); + sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); } const auto cblock = m_ir->GetInsertBlock(); @@ -5945,47 +6646,70 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return result; } + llvm::BasicBlock* add_block_next() + { + if (m_interp_magn) + { + const auto cblock = m_ir->GetInsertBlock(); + m_ir->SetInsertPoint(m_interp_bblock); + const auto target = m_ir->CreatePHI(get_type(), 2); + target->addIncoming(m_interp_pc_next, cblock); + target->addIncoming(m_interp_pc, m_interp_bblock->getSinglePredecessor()); + m_ir->SetInsertPoint(cblock); + m_interp_pc = target; + return m_interp_bblock; + } + + return add_block(m_pos + 4); + } + void BIZ(spu_opcode_t op) // { - m_block->block_end = m_ir->GetInsertBlock(); + if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 3) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); + m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BINZ(spu_opcode_t op) // { - m_block->block_end = m_ir->GetInsertBlock(); + if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 3) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); + m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BIHZ(spu_opcode_t op) // { - m_block->block_end = m_ir->GetInsertBlock(); + if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 6) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); + m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BIHNZ(spu_opcode_t op) // { - m_block->block_end = m_ir->GetInsertBlock(); + if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 6) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); - m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); + m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BI(spu_opcode_t op) // { - m_block->block_end = m_ir->GetInsertBlock(); + if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); + if (m_interp_magn) + { + m_ir->CreateBr(add_block_indirect(op, addr)); + return; + } + // Create jump table if necessary (TODO) const auto tfound = m_targets.find(m_pos); @@ -6048,7 +6772,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BISL(spu_opcode_t op) // { - m_block->block_end = m_ir->GetInsertBlock(); + if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); set_link(op); m_ir->CreateBr(add_block_indirect(op, addr, false)); @@ -6056,7 +6780,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void IRET(spu_opcode_t op) // { - m_block->block_end = m_ir->GetInsertBlock(); + if (m_block) m_block->block_end = m_ir->GetInsertBlock(); value_t srr0; srr0.value = m_ir->CreateLoad(spu_ptr(&spu_thread::srr0)); m_ir->CreateBr(add_block_indirect(op, srr0)); @@ -6069,6 +6793,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRZ(spu_opcode_t op) // { + if (m_interp_magn) + { + value_t target; + target.value = m_interp_pc; + target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); + m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 3) == 0).value, target.value, m_interp_pc_next); + return; + } + const u32 target = spu_branch_target(m_pos, op.i16); if (target != m_pos + 4) @@ -6081,6 +6814,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRNZ(spu_opcode_t op) // { + if (m_interp_magn) + { + value_t target; + target.value = m_interp_pc; + target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); + m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 3) != 0).value, target.value, m_interp_pc_next); + return; + } + const u32 target = spu_branch_target(m_pos, op.i16); if (target != m_pos + 4) @@ -6093,6 +6835,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRHZ(spu_opcode_t op) // { + if (m_interp_magn) + { + value_t target; + target.value = m_interp_pc; + target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); + m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 6) == 0).value, target.value, m_interp_pc_next); + return; + } + const u32 target = spu_branch_target(m_pos, op.i16); if (target != m_pos + 4) @@ -6105,6 +6856,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRHNZ(spu_opcode_t op) // { + if (m_interp_magn) + { + value_t target; + target.value = m_interp_pc; + target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); + m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 6) != 0).value, target.value, m_interp_pc_next); + return; + } + const u32 target = spu_branch_target(m_pos, op.i16); if (target != m_pos + 4) @@ -6117,6 +6877,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BRA(spu_opcode_t op) // { + if (m_interp_magn) + { + m_interp_pc = eval((get_imm(op.i16, false) << 2) & 0x3fffc).value; + return; + } + const u32 target = spu_branch_target(0, op.i16); if (target != m_pos + 4) @@ -6134,6 +6900,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void BR(spu_opcode_t op) // { + if (m_interp_magn) + { + value_t target; + target.value = m_interp_pc; + target = eval((target + (get_imm(op.i16, false) << 2)) & 0x3fffc); + m_interp_pc = target.value; + return; + } + const u32 target = spu_branch_target(m_pos, op.i16); if (target != m_pos + 4) @@ -6151,13 +6926,21 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void set_link(spu_opcode_t op) { + if (m_interp_magn) + { + value_t next; + next.value = m_interp_pc_next; + set_vr(op.rt, insert(splat(0), 3, next)); + return; + } + set_vr(op.rt, build(0, 0, 0, spu_branch_target(m_pos + 4))); if (g_cfg.core.spu_block_size != spu_block_size_type::safe && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1]) { // Store the return function chunk address at the stack mirror const auto func = add_function(m_pos + 4); - const auto stack0 = eval(zext(extract(get_vr(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror)); + const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror)); const auto stack1 = eval(stack0 + 8); m_ir->CreateStore(func, m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack0.value), func->getType()->getPointerTo())); m_ir->CreateStore(m_ir->getInt64(m_pos + 4), m_ir->CreateBitCast(m_ir->CreateGEP(m_thread, stack1.value), get_type())); @@ -6167,17 +6950,22 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator static const spu_decoder g_decoder; }; -std::unique_ptr spu_recompiler_base::make_llvm_recompiler() +std::unique_ptr spu_recompiler_base::make_llvm_recompiler(u8 magn) { - return std::make_unique(); + return std::make_unique(magn); } DECLARE(spu_llvm_recompiler::g_decoder); #else -std::unique_ptr spu_recompiler_base::make_llvm_recompiler() +std::unique_ptr spu_recompiler_base::make_llvm_recompiler(u8 magn) { + if (magn) + { + return nullptr; + } + fmt::throw_exception("LLVM is not available in this build."); } diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 518bb58f2cee..e645d8d2f12b 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -108,6 +108,9 @@ class spu_runtime // All dispatchers (array allocated in jit memory) static atomic_t* const g_dispatcher; + // Interpreter entry point + static spu_function_t g_interpreter; + struct passive_lock { spu_runtime& _this; @@ -253,7 +256,7 @@ class spu_recompiler_base static std::unique_ptr make_asmjit_recompiler(); // Create recompiler instance (LLVM) - static std::unique_ptr make_llvm_recompiler(); + static std::unique_ptr make_llvm_recompiler(u8 magn = 0); enum : u8 { diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 872c7b5f5ea8..23dc4753a89a 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -23,16 +23,6 @@ #include #include -const bool s_use_ssse3 = -#ifdef _MSC_VER - utils::has_ssse3(); -#elif __SSSE3__ - true; -#else - false; -#define _mm_shuffle_epi8 -#endif - #ifdef _MSC_VER bool operator ==(const u128& lhs, const u128& rhs) { @@ -597,6 +587,23 @@ void spu_thread::cpu_task() return; } + if (spu_runtime::g_interpreter) + { + while (true) + { + if (UNLIKELY(state)) + { + if (check_state()) + break; + } + + spu_runtime::g_interpreter(*this, vm::_ptr(offset), nullptr); + } + + cpu_stop(); + return; + } + // Select opcode table const auto& table = *( g_cfg.core.spu_decoder == spu_decoder_type::precise ? &g_spu_interpreter_precise.get_table() : @@ -605,11 +612,6 @@ void spu_thread::cpu_task() // LS pointer const auto base = vm::_ptr(offset); - const auto bswap4 = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); - - v128 _op; - using func_t = decltype(&spu_interpreter::UNK); - func_t func0, func1, func2, func3, func4, func5; while (true) { @@ -617,62 +619,11 @@ void spu_thread::cpu_task() { if (check_state()) break; - - // Decode single instruction (may be step) - const u32 op = *reinterpret_cast*>(base + pc); - if (table[spu_decode(op)](*this, {op})) { pc += 4; } - continue; } - if (pc % 16 || !s_use_ssse3) - { - // Unaligned - const u32 op = *reinterpret_cast*>(base + pc); - if (table[spu_decode(op)](*this, {op})) { pc += 4; } - continue; - } - - // Reinitialize - _op.vi = _mm_shuffle_epi8(_mm_load_si128(reinterpret_cast(base + pc)), bswap4); - func0 = table[spu_decode(_op._u32[0])]; - func1 = table[spu_decode(_op._u32[1])]; - func2 = table[spu_decode(_op._u32[2])]; - func3 = table[spu_decode(_op._u32[3])]; - - while (LIKELY(func0(*this, {_op._u32[0]}))) - { + const u32 op = *reinterpret_cast*>(base + pc); + if (table[spu_decode(op)](*this, {op})) pc += 4; - if (LIKELY(func1(*this, {_op._u32[1]}))) - { - pc += 4; - u32 op2 = _op._u32[2]; - u32 op3 = _op._u32[3]; - _op.vi = _mm_shuffle_epi8(_mm_load_si128(reinterpret_cast(base + pc + 8)), bswap4); - func0 = table[spu_decode(_op._u32[0])]; - func1 = table[spu_decode(_op._u32[1])]; - func4 = table[spu_decode(_op._u32[2])]; - func5 = table[spu_decode(_op._u32[3])]; - if (LIKELY(func2(*this, {op2}))) - { - pc += 4; - if (LIKELY(func3(*this, {op3}))) - { - pc += 4; - func2 = func4; - func3 = func5; - - if (UNLIKELY(state)) - { - break; - } - continue; - } - break; - } - break; - } - break; - } } cpu_stop(); diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index 943fdeb2c3e2..a3668603a35b 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -291,7 +291,12 @@ settings_dialog::settings_dialog(std::shared_ptr guiSettings, std: ui->accurateXFloat->setEnabled(checked); }); - ui->accurateXFloat->setEnabled(ui->spu_llvm->isChecked()); + connect(ui->spu_fast, &QAbstractButton::toggled, [this](bool checked) + { + ui->accurateXFloat->setEnabled(checked); + }); + + ui->accurateXFloat->setEnabled(ui->spu_llvm->isChecked() || ui->spu_fast->isChecked()); #ifndef LLVM_AVAILABLE ui->ppu_llvm->setEnabled(false);