Skip to content

Commit

Permalink
SPU LLVM: PUTLLC 16 Optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
elad335 committed Apr 11, 2024
1 parent 3aca6b3 commit 000a505
Show file tree
Hide file tree
Showing 8 changed files with 2,267 additions and 140 deletions.
2,096 changes: 1,986 additions & 110 deletions rpcs3/Emu/Cell/SPUCommonRecompiler.cpp

Large diffs are not rendered by default.

182 changes: 182 additions & 0 deletions rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "Emu/system_config.h"
#include "Emu/IdManager.h"
#include "Emu/Cell/timers.hpp"
#include "Emu/Memory/vm_reservation.h"
#include "Crypto/sha1.h"
#include "Utilities/JIT.h"

Expand Down Expand Up @@ -534,6 +535,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
return m_ir->CreateGEP(get_type<u8>(), base, m_ir->getInt64(offset));
}

// Builds a pointer to T located `offset` bytes past `base`.
// The offset is applied as a byte-wise (u8) GEP and the result is then cast to T*.
template <typename T = u8>
llvm::Value* _ptr(llvm::Value* base, llvm::Value* offset)
{
	return m_ir->CreateBitCast(m_ir->CreateGEP(get_type<u8>(), base, offset), get_type<T*>());
}

template <typename T, typename... Args>
llvm::Value* spu_ptr(Args... offset_args)
{
Expand Down Expand Up @@ -1078,6 +1087,159 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
m_ir->SetInsertPoint(_body);
}

void putllc16_pattern(const spu_program& prog, utils::address_range range)
{
// Prevent store elimination
m_block->store_context_ctr[s_reg_mfc_eal]++;
m_block->store_context_ctr[s_reg_mfc_lsa]++;
m_block->store_context_ctr[s_reg_mfc_tag]++;
m_block->store_context_ctr[s_reg_mfc_size]++;

static const auto on_fail = [](spu_thread* _spu, u32 addr)
{
if (const u32 raddr = _spu->raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || _spu->rtime != (vm::reservation_acquire(raddr) & -128) || std::memcmp(&_spu->rdata, vm::_ptr<decltype(_spu->rdata)>(raddr), 128))
{
_spu->set_events(SPU_EVENT_LR);
}
}
};

union putllc16_info
{
u32 data;
bf_t<u32, 31, 1> is_const;
bf_t<u32, 30, 1> is_pc_rel;
bf_t<u32, 29, 1> runtime16_select;
bf_t<u32, 0, 8> reg;
bf_t<u32, 8, 18> offs;
} info;

const auto _next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _next0 = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _next1 = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _next2 = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _fail = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _fail0 = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _final = llvm::BasicBlock::Create(m_context, "", m_function);
info.data = range.end;

const auto _eal = (get_reg_fixed<u32>(s_reg_mfc_eal) & -128).eval(m_ir);

m_ir->CreateCondBr(m_ir->CreateICmpEQ(_eal, m_ir->CreateLoad(get_type<u32>(), spu_ptr<u32>(&spu_thread::raddr))), _next, _fail, m_md_likely);
m_ir->SetInsertPoint(_next);

value_t<u32> eal_val;
eal_val.value = _eal;

const auto rptr = _ptr<u64>(m_ir->CreateLoad(get_type<u8*>(), spu_ptr<u8*>(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir));
const auto rval = m_ir->CreateLoad(get_type<u64>(), spu_ptr<u64>(&spu_thread::rtime));
m_ir->CreateCondBr(
m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateOr(rval, 0x7f), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1)
, _next0
, _fail);

m_ir->SetInsertPoint(_next0);

const auto _lsa = (get_reg_fixed<u32>(s_reg_mfc_lsa) & 0x3ff80).eval(m_ir);
const auto _dest = info.is_pc_rel ? get_pc(info.offs) :
!info.is_const ? (extract(get_reg_fixed(info.reg), 3) + splat<u32>(info.offs)).eval(m_ir) : splat<u32>(info.offs & 0x3fff0).eval(m_ir);
const auto dest = info.is_const ? _dest : m_ir->CreateAnd(_dest, m_ir->getInt32(0x3fff0));

const auto diff = m_ir->CreateZExt(m_ir->CreateSub(dest, _lsa), get_type<u64>());

if (info.runtime16_select)
{
m_ir->CreateCondBr(m_ir->CreateICmpULT(diff, m_ir->getInt64(128)), _next1, _next2);
}
else
{
m_ir->CreateBr(_next1);
}

m_ir->SetInsertPoint(_next1);
const auto _new = m_ir->CreateAlignedLoad(get_type<u128>(), _ptr<u128>(m_lsptr, dest), llvm::MaybeAlign{16});
const auto _rdata = m_ir->CreateAlignedLoad(get_type<u128>(), _ptr<u128>(spu_ptr<u8>(&spu_thread::rdata), diff), llvm::MaybeAlign{16});

m_ir->CreateCondBr(
m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(_ptr<u128>(_ptr<u8>(m_memptr, _eal), diff), _rdata, _new, llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1)
, _next2
, _fail0);

m_ir->SetInsertPoint(_next2);
m_ir->CreateAlignedStore(m_ir->CreateAdd(rval, m_ir->getInt64(128)), rptr, llvm::MaybeAlign{8});
call("atomic_wait_engine::notify_all", static_cast<void(*)(const void*)>(atomic_wait_engine::notify_all), rptr);
m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr<u64>(&spu_thread::ch_atomic_stat));
m_ir->CreateBr(_final);

m_ir->SetInsertPoint(_fail0);
m_ir->CreateStore(rval, rptr);
m_ir->CreateBr(_fail);
m_ir->SetInsertPoint(_fail);
call("PUTLLC16_fail", +on_fail, m_thread, _eal);
m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr<u64>(&spu_thread::ch_atomic_stat));
m_ir->CreateBr(_final);

m_ir->SetInsertPoint(_final);
m_ir->CreateStore(m_ir->getInt32(0), spu_ptr<u32>(&spu_thread::raddr));
}

void putllc0_pattern(const spu_program& prog, utils::address_range range)
{
// Prevent store elimination
m_block->store_context_ctr[s_reg_mfc_eal]++;
m_block->store_context_ctr[s_reg_mfc_lsa]++;
m_block->store_context_ctr[s_reg_mfc_tag]++;
m_block->store_context_ctr[s_reg_mfc_size]++;

static const auto on_fail = [](spu_thread* _spu, u32 addr)
{
if (const u32 raddr = _spu->raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || _spu->rtime != (vm::reservation_acquire(raddr) & -128) || std::memcmp(&_spu->rdata, vm::_ptr<decltype(_spu->rdata)>(raddr), 128))
{
_spu->set_events(SPU_EVENT_LR);
}
}
};

const auto _next = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _next0 = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _fail = llvm::BasicBlock::Create(m_context, "", m_function);
const auto _final = llvm::BasicBlock::Create(m_context, "", m_function);

const auto _eal = (get_reg_fixed<u32>(s_reg_mfc_eal) & -128).eval(m_ir);

m_ir->CreateCondBr(m_ir->CreateICmpEQ(_eal, m_ir->CreateLoad(get_type<u32>(), spu_ptr<u32>(&spu_thread::raddr))), _next, _fail, m_md_likely);
m_ir->SetInsertPoint(_next);

value_t<u32> eal_val;
eal_val.value = _eal;

const auto rptr = _ptr<u64>(m_ir->CreateLoad(get_type<u8*>(), spu_ptr<u8*>(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir));
const auto rval = m_ir->CreateLoad(get_type<u64>(), spu_ptr<u64>(&spu_thread::rtime));
m_ir->CreateCondBr(
m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateAdd(rval, m_ir->getInt64(128)), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1)
, _next0
, g_cfg.core.spu_accurate_reservations ? _fail : _next0); // Succeed unconditionally

m_ir->SetInsertPoint(_next0);
call("atomic_wait_engine::notify_all", static_cast<void(*)(const void*)>(atomic_wait_engine::notify_all), rptr);
m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr<u64>(&spu_thread::ch_atomic_stat));
m_ir->CreateBr(_final);

m_ir->SetInsertPoint(_fail);
call("PUTLLC0_fail", +on_fail, m_thread, _eal);
m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr<u64>(&spu_thread::ch_atomic_stat));
m_ir->CreateBr(_final);

m_ir->SetInsertPoint(_final);
m_ir->CreateStore(m_ir->getInt32(0), spu_ptr<u32>(&spu_thread::raddr));
}

public:
spu_llvm_recompiler(u8 interp_magn = 0)
: spu_recompiler_base()
Expand Down Expand Up @@ -1621,6 +1783,26 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
else
m_next_op = func.data[(m_pos - start) / 4 + 1];

// Dispatch on the attribute recorded for the current instruction (one entry per 4-byte instruction).
// Every handled case ends with `continue`, so the regular decode call that follows is skipped.
switch (func.inst_attrs[(m_pos - start) / 4])
{
case spu_program::inst_attr::putllc0:
{
// Pattern data is keyed by the byte offset from the function start; at() throws if no entry exists
putllc0_pattern(func, func.patterns.at(m_pos - start).range);
continue;
}
case spu_program::inst_attr::putllc16:
{
// Same lookup as above; putllc16_pattern decodes its packed descriptor from range.end
putllc16_pattern(func, func.patterns.at(m_pos - start).range);
continue;
}
case spu_program::inst_attr::omit:
{
// Nothing is emitted for this instruction
// TODO
continue;
}
// Any other attribute (including none) falls through to the regular decoder
default: break;
}

// Execute recompiler function (TODO)
(this->*decode(op))({op});
}
Expand Down
25 changes: 25 additions & 0 deletions rpcs3/Emu/Cell/SPURecompiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "Utilities/File.h"
#include "Utilities/lockless.h"
#include "Utilities/address_range.h"
#include "SPUThread.h"
#include <vector>
#include <bitset>
Expand Down Expand Up @@ -59,6 +60,27 @@ struct spu_program
// Program data with intentionally wrong endianness (on LE platform opcode values are swapped)
std::vector<u32> data;

// TODO: Add patterns
// Not a bitset to allow more possibilities
// Per-instruction attribute; one value is stored for each instruction in inst_attrs.
enum class inst_attr : u8
{
	none     = 0, // No special handling, the instruction is decoded normally
	omit     = 1, // The LLVM recompiler emits nothing for the instruction
	putllc16 = 2, // Emitted by the LLVM recompiler's putllc16_pattern
	putllc0  = 3, // Emitted by the LLVM recompiler's putllc0_pattern
};

std::vector<inst_attr> inst_attrs;

// Extra data attached to a detected instruction pattern (stored as the mapped value of `patterns`).
struct pattern_info
{
// Passed by the LLVM recompiler to the putllc pattern handlers.
// NOTE(review): putllc16_pattern reads range.end as a packed descriptor rather than an address - confirm against add_pattern.
utils::address_range range;
};

std::unordered_map<u32, pattern_info> patterns;

void add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end = -1);

bool operator==(const spu_program& rhs) const noexcept;

bool operator<(const spu_program& rhs) const noexcept;
Expand Down Expand Up @@ -212,6 +234,9 @@ class spu_recompiler_base
// List of block predecessors
std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_preds;

// List of loop connectors (destination -> branches)
std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_loops;

// List of function entry points and return points (set after BRSL, BRASL, BISL, BISLED)
std::bitset<0x10000> m_entry_info;

Expand Down
1 change: 1 addition & 0 deletions rpcs3/Emu/Cell/SPUThread.h
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,7 @@ class spu_thread : public cpu_thread

// May be used by recompilers.
u8* memory_base_addr = vm::g_base_addr;
u8* reserv_base_addr = vm::g_reservations;

// General-Purpose Registers
std::array<v128, 128> gpr;
Expand Down

0 comments on commit 000a505

Please sign in to comment.