Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SPU LLVM: Optimize GB/GBH/GBB with a GFNI path #14669

Merged
merged 3 commits into from Oct 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions rpcs3/Emu/CPU/CPUTranslator.cpp
Expand Up @@ -154,6 +154,16 @@ void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngin
m_use_vnni = true;
}

// Test GFNI feature (TODO)
if (cpu == "tremont" ||
cpu == "gracemont" ||
cpu == "alderlake" ||
cpu == "raptorlake" ||
cpu == "meteorlake")
{
m_use_gfni = true;
Whatcookie marked this conversation as resolved.
Show resolved Hide resolved
}

// Test AVX-512_icelake features (TODO)
if (cpu == "icelake" ||
cpu == "icelake-client" ||
Expand All @@ -168,6 +178,7 @@ void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngin
m_use_avx512 = true;
m_use_avx512_icl = true;
m_use_vnni = true;
m_use_gfni = true;
}

// Aarch64 CPUs
Expand Down
3 changes: 3 additions & 0 deletions rpcs3/Emu/CPU/CPUTranslator.h
Expand Up @@ -2971,6 +2971,9 @@ class cpu_translator
// Allow VNNI
bool m_use_vnni = false;

// Allow GFNI
bool m_use_gfni = false;

// Allow Icelake tier AVX-512
bool m_use_avx512_icl = false;

Expand Down
31 changes: 30 additions & 1 deletion rpcs3/Emu/Cell/SPURecompiler.cpp
Expand Up @@ -8134,21 +8134,50 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator

void GB(spu_opcode_t op)
{
// GFNI trick to extract selected bit from bytes
// By treating the first input as constant, and the second input as variable,
// with only 1 bit set in our constant, gf2p8affineqb will extract that selected bit
// from each byte of the second operand
if (m_use_gfni)
{
const auto a = get_vr<u8[16]>(op.ra);
const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 12, 8, 4, 0);
set_vr(op.rt, gf2p8affineqb(build<u8[16]>(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0));
return;
}

const auto a = get_vr<s32[4]>(op.ra);
const auto m = zext<u32>(bitcast<i4>(trunc<bool[4]>(a)));
set_vr(op.rt, insert(splat<u32[4]>(0), 3, eval(m)));
}

void GBH(spu_opcode_t op)
{
if (m_use_gfni)
{
const auto a = get_vr<u8[16]>(op.ra);
const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0);
set_vr(op.rt, gf2p8affineqb(build<u8[16]>(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0));
return;
}

const auto a = get_vr<s16[8]>(op.ra);
const auto m = zext<u32>(bitcast<u8>(trunc<bool[8]>(a)));
set_vr(op.rt, insert(splat<u32[4]>(0), 3, eval(m)));
}

void GBB(spu_opcode_t op)
{
const auto a = get_vr<s8[16]>(op.ra);
const auto a = get_vr<u8[16]>(op.ra);

if (m_use_gfni)
{
const auto as = zshuffle(a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
const auto m = gf2p8affineqb(build<u8[16]>(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0), as, 0x0);
set_vr(op.rt, zshuffle(m, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
return;
}

const auto m = zext<u32>(bitcast<u16>(trunc<bool[16]>(a)));
set_vr(op.rt, insert(splat<u32[4]>(0), 3, eval(m)));
}
Expand Down