Skip to content

Commit

Permalink
SPU LLVM: add AVX-512 SPU verification
Browse files Browse the repository at this point in the history
- This is hidden behind a new setting, as some CPUs may downclock aggressively when executing 512-bit-wide instructions
  • Loading branch information
Whatcookie committed Apr 14, 2021
1 parent 20c69a0 commit d57a508
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 44 deletions.
1 change: 1 addition & 0 deletions rpcs3/Emu/CPU/CPUTranslator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngin
cpu == "tigerlake")
{
m_use_fma = true;
m_use_avx512 = true;
}

// Test AVX-512_icelake features (TODO)
Expand Down
3 changes: 3 additions & 0 deletions rpcs3/Emu/CPU/CPUTranslator.h
Original file line number Diff line number Diff line change
Expand Up @@ -2422,6 +2422,9 @@ class cpu_translator
// Allow FMA
bool m_use_fma = false;

// Allow Skylake-X tier AVX-512
bool m_use_avx512 = false;

// Allow Icelake tier AVX-512
bool m_use_avx512_icl = false;

Expand Down
161 changes: 117 additions & 44 deletions rpcs3/Emu/Cell/SPURecompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4372,67 +4372,140 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator

llvm::Value* acc = nullptr;

for (u32 j = starta; j < end; j += 32)
if (m_use_avx512 && g_cfg.core.full_width_avx512)
{
int indices[8];
bool holes = false;
bool data = false;

for (u32 i = 0; i < 8; i++)
for (u32 j = starta; j < end; j += 64)
{
const u32 k = j + i * 4;
int indices[16];
bool holes = false;
bool data = false;

if (k < start || k >= end || !func.data[(k - start) / 4])
for (u32 i = 0; i < 16; i++)
{
indices[i] = 8;
holes = true;
const u32 k = j + i * 4;

if (k < start || k >= end || !func.data[(k - start) / 4])
{
indices[i] = 16;
holes = true;
}
else
{
indices[i] = i;
data = true;
}
}
else

if (!data)
{
indices[i] = i;
data = true;
// Skip full-sized holes
continue;
}
}

if (!data)
{
// Skip full-sized holes
continue;
}
// Load unaligned code block from LS
llvm::Value* vls = m_ir->CreateAlignedLoad(_ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});

// Load unaligned code block from LS
llvm::Value* vls = m_ir->CreateAlignedLoad(_ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});
// Mask if necessary
if (holes)
{
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), indices);
}

// Mask if necessary
if (holes)
{
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), indices);
}
// Perform bitwise comparison and accumulate
u32 words[16];

// Perform bitwise comparison and accumulate
u32 words[8];
for (u32 i = 0; i < 16; i++)
{
const u32 k = j + i * 4;
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
}

for (u32 i = 0; i < 8; i++)
{
const u32 k = j + i * 4;
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words));
acc = acc ? m_ir->CreateOr(acc, vls) : vls;
check_iterations++;
}

vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words));
acc = acc ? m_ir->CreateOr(acc, vls) : vls;
check_iterations++;
// Pattern for PTEST
acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 1));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 2));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 3));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 4));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 5));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 6));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 7));

// Compare result with zero
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);

}
else
{

for (u32 j = starta; j < end; j += 32)
{
int indices[8];
bool holes = false;
bool data = false;

// Pattern for PTEST
acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 1));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 2));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 3));
for (u32 i = 0; i < 8; i++)
{
const u32 k = j + i * 4;

// Compare result with zero
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
if (k < start || k >= end || !func.data[(k - start) / 4])
{
indices[i] = 8;
holes = true;
}
else
{
indices[i] = i;
data = true;
}
}

if (!data)
{
// Skip full-sized holes
continue;
}

// Load unaligned code block from LS
llvm::Value* vls = m_ir->CreateAlignedLoad(_ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});

// Mask if necessary
if (holes)
{
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), indices);
}

// Perform bitwise comparison and accumulate
u32 words[8];

for (u32 i = 0; i < 8; i++)
{
const u32 k = j + i * 4;
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
}

vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words));
acc = acc ? m_ir->CreateOr(acc, vls) : vls;
check_iterations++;
}

// Pattern for PTEST
acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 1));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 2));
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 3));

// Compare result with zero
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
}
}

// Increase block counter with statistics
Expand Down
1 change: 1 addition & 0 deletions rpcs3/Emu/system_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ struct cfg_root : cfg::node
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false };
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false};

cfg::_bool debug_console_mode{ this, "Debug Console Mode", false }; // Debug console emulation, not recommended
cfg::_bool hook_functions{ this, "Hook static functions" };
Expand Down
2 changes: 2 additions & 0 deletions rpcs3/rpcs3qt/emu_settings_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ enum class emu_settings_type
SleepTimersAccuracy,
ClocksScale,
PerformanceReport,
FullWidthAVX512,

// Graphics
Renderer,
Expand Down Expand Up @@ -189,6 +190,7 @@ inline static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::ClocksScale, { "Core", "Clocks scale"}},
{ emu_settings_type::AccuratePPU128Loop, { "Core", "Accurate PPU 128-byte Reservation Op Max Length"}},
{ emu_settings_type::PerformanceReport, { "Core", "Enable Performance Report"}},
{ emu_settings_type::FullWidthAVX512, { "Core", "Full Width AVX-512"}},

// Graphics Tab
{ emu_settings_type::Renderer, { "Video", "Renderer"}},
Expand Down
4 changes: 4 additions & 0 deletions rpcs3/rpcs3qt/settings_dialog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,10 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
m_emu_settings->EnhanceCheckBox(ui->accurateXFloat, emu_settings_type::AccurateXFloat);
SubscribeTooltip(ui->accurateXFloat, tooltips.settings.accurate_xfloat);

m_emu_settings->EnhanceCheckBox(ui->fullWidthAVX512, emu_settings_type::FullWidthAVX512);
SubscribeTooltip(ui->fullWidthAVX512, tooltips.settings.full_width_avx512);
ui->fullWidthAVX512->setDisabled(!utils::has_avx512());

// Comboboxes

m_emu_settings->EnhanceComboBox(ui->spuBlockSize, emu_settings_type::SPUBlockSize);
Expand Down
7 changes: 7 additions & 0 deletions rpcs3/rpcs3qt/settings_dialog.ui
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="fullWidthAVX512">
<property name="text">
<string>Full Width AVX-512</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
Expand Down
1 change: 1 addition & 0 deletions rpcs3/rpcs3qt/tooltips.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class Tooltips : public QObject
const QString enable_tsx = tr("Enable usage of TSX instructions.\nNeeds to be forced on some Haswell or Broadwell CPUs.\nForcing this on older Hardware can lead to system instability, use it with caution.");
const QString spu_block_size = tr("This option controls the SPU analyser, particularly the size of compiled units. The Mega and Giga modes may improve performance by tying smaller units together, decreasing the number of compiled units but increasing their size.\nUse the Safe mode for maximum compatibility.");
const QString preferred_spu_threads = tr("Some SPU stages are sensitive to race conditions and allowing a limited number at a time helps alleviate performance stalls.\nSetting this to a smaller value might improve performance and reduce stuttering in some games.\nLeave this on auto if performance is negatively affected when setting a small value.");
const QString full_width_avx512 = tr("Enables the use of code with full width AVX-512.\nThis code can be executed much faster, but may cause a loss in performance if your CPU model experiences downclocking on wide AVX-512 loads.\nNote that AVX-512 instructions will be used regardless of this option, just at 128 and 256 bit width.");

// debug

Expand Down

0 comments on commit d57a508

Please sign in to comment.