vk: Allow host GPU to synchronize directly with CELL via RSX semaphores #11568

Merged · 5 commits · Feb 23, 2022
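
In short: instead of the CPU waiting on the GPU and then writing RSX/GCM labels (the semaphores CELL polls) itself, the Vulkan backend can now ask the host GPU to write the label value directly into guest-visible memory with vkCmdUpdateBuffer. A small host-visible block of event counters (vk::host_data_t, added in vkutils/sync.h below) tells the RSX thread how far the GPU has progressed. A minimal sketch of that handshake, with illustrative names and std::atomic standing in for the volatile mapping the real code uses (see VKGSRender::release_GCM_label in the diff):

// Sketch only; assumes x86 for _mm_pause().
#include <atomic>
#include <cstdint>
#include <immintrin.h>

struct host_events
{
	std::atomic<std::uint64_t> last_label_release_event{0}; // bumped when a label write is handed to the GPU
	std::atomic<std::uint64_t> commands_complete_event{0};  // stamped by the GPU when the carrying command buffer retires
};

inline void drain_label_queue(const host_events& ev)
{
	// Spin until the GPU has consumed every label release queued so far.
	while (ev.commands_complete_event.load(std::memory_order_acquire) <
	       ev.last_label_release_event.load(std::memory_order_acquire))
	{
		_mm_pause();
	}
}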
2 changes: 2 additions & 0 deletions rpcs3/Emu/RSX/RSXThread.h
@@ -590,6 +590,7 @@ namespace rsx
bool supports_hw_conditional_render; // Conditional render
bool supports_passthrough_dma; // DMA passthrough
bool supports_asynchronous_compute; // Async compute
bool supports_host_gpu_labels; // Advanced host synchronization
};

struct sampled_image_descriptor_base;
@@ -859,6 +860,7 @@ namespace rsx
void sync();
flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
virtual void sync_hint(FIFO_hint hint, void* args);
virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }
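
The base implementation above returns false, so a backend that cannot service the label on the GPU timeline simply declines and the caller keeps using the existing CPU write. A hypothetical call site, for illustration only (write_label_from_cpu is a stand-in, not part of this diff):

void on_gcm_semaphore_release(rsx::thread* rsxthr, u32 address, u32 value)
{
	// Let the backend try to perform the write on the GPU timeline first.
	if (!rsxthr->release_GCM_label(address, value))
	{
		write_label_from_cpu(address, value); // stand-in for the current CPU path
	}
}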

std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;

168 changes: 142 additions & 26 deletions rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -550,22 +550,31 @@ VKGSRender::VKGSRender() : GSRender()
// Relaxed query synchronization
backend_config.supports_hw_conditional_render = !!g_cfg.video.relaxed_zcull_sync;

// Passthrough DMA
backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();

// Host sync
backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;

// Async compute and related operations
if (g_cfg.video.vk.asynchronous_texture_streaming)
{
// Optimistic, enable async compute
backend_config.supports_asynchronous_compute = true;

if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
{
rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
backend_config.supports_asynchronous_compute = false;
}
}

// Sanity checks
switch (vk::get_driver_vendor())
{
case vk::driver_vendor::NVIDIA:
	if (backend_config.supports_asynchronous_compute)
	{
		if (auto chip_family = vk::get_chip_family();
			chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
		{

@@ -574,34 +583,54 @@ VKGSRender::VKGSRender() : GSRender()

		rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
		g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
	}
	break;
#if !defined(_WIN32)
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
case vk::driver_vendor::RADV:
case vk::driver_vendor::AMD:
#if !defined(__linux__)
// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
case vk::driver_vendor::ANV:
#endif
	if (backend_config.supports_passthrough_dma)
	{
		rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
		backend_config.supports_passthrough_dma = false;
	}
	break;
#endif
case vk::driver_vendor::MVK:
	// Async compute crashes immediately on Apple GPUs
	rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
	backend_config.supports_asynchronous_compute = false;
	break;
default: break;
}

if (backend_config.supports_asynchronous_compute)
{
// Run only if async compute can be used.
g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
}

if (backend_config.supports_host_gpu_labels)
{
if (backend_config.supports_passthrough_dma)
{
m_host_object_data = std::make_unique<vk::buffer>(*m_device,
0x10000,
memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
VMM_ALLOCATION_POOL_SYSTEM);

m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
ensure(m_host_data_ptr->magic == 0xCAFEBABE);
}
else
{
rsx_log.error("Your GPU/driver does not support extensions required to enable passthrough DMA emulation. Host GPU labels will be disabled.");
backend_config.supports_host_gpu_labels = false;
}
}
}
@@ -629,6 +658,13 @@ VKGSRender::~VKGSRender()
g_fxo->get<vk::AsyncTaskScheduler>().destroy();
}

// Host data
if (m_host_object_data)
{
m_host_object_data->unmap();
m_host_object_data.reset();
}

// Clear flush requests
m_flush_requests.clear_pending_flag();

@@ -1453,6 +1489,75 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
m_current_command_buffer->begin();
}

bool VKGSRender::release_GCM_label(u32 address, u32 args)
{
if (!backend_config.supports_host_gpu_labels)
{
return false;
}

auto drain_label_queue = [this]()
{
while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
{
_mm_pause();

if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
}
};

ensure(m_host_data_ptr);
if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
{
// All texture loads already seen by the host GPU
// Wait for all previously submitted labels to be flushed
drain_label_queue();
return false;
}

const auto mapping = vk::map_dma(address, 4);
const auto write_data = std::bit_cast<u32, be_t<u32>>(args);

if (!dynamic_cast<vk::memory_block_host*>(mapping.second->memory.get()))
{
// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
// Take the L and try the fallback.
rsx_log.warning("Host label update at 0x%x was not possible.", address);
drain_label_queue();
return false;
}

m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;

// The newest texture load request has not been covered by a submission yet; record the label write on the primary command buffer and flush.
if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
{
if (vk::is_renderpass_open(*m_current_command_buffer))
{
vk::end_renderpass(*m_current_command_buffer);
}

vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
flush_command_queue();
}
else
{
// The outstanding texture load was already covered by an earlier submission; write the label and the completion event from a secondary command buffer so the primary one does not need to be flushed.
auto cmd = m_secondary_cb_list.next();
cmd->begin();
vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
cmd->end();

vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
cmd->submit(submit_info);

m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}
return true;
}

void VKGSRender::sync_hint(rsx::FIFO_hint hint, void* args)
{
ensure(args);
@@ -2088,6 +2193,17 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}

// Publish any label releases recorded since the last submit: the GPU stamps the newest event ID into commands_complete_event when this command buffer finishes, unblocking the drain loop in release_GCM_label().
if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
{
vkCmdUpdateBuffer(*m_current_command_buffer,
m_host_object_data->value,
::offset32(&vk::host_data_t::commands_complete_event),
sizeof(u64),
const_cast<u64*>(&m_host_data_ptr->last_label_release_event));

m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}

m_current_command_buffer->end();
m_current_command_buffer->tag();

7 changes: 7 additions & 0 deletions rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -117,6 +117,9 @@ class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
vk::command_buffer_chunk* m_current_command_buffer = nullptr;
VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE;

volatile vk::host_data_t* m_host_data_ptr = nullptr;
std::unique_ptr<vk::buffer> m_host_object_data;

VkDescriptorSetLayout descriptor_layouts;
VkPipelineLayout pipeline_layout;

@@ -242,6 +245,7 @@ class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
void bind_viewport();

void sync_hint(rsx::FIFO_hint hint, void* args) override;
bool release_GCM_label(u32 address, u32 data) override;

void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
@@ -259,6 +263,9 @@ class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
void begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) override;
void end_conditional_rendering() override;

// Host sync object
inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }

protected:
void clear_surface(u32 mask) override;
void begin() override;
12 changes: 12 additions & 0 deletions rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -9,6 +9,7 @@

#include "vkutils/data_heap.h"
#include "vkutils/image_helpers.h"
#include "VKGSRender.h"

#include "../GCM.h"
#include "../rsx_utils.h"
@@ -1146,6 +1147,17 @@ namespace vk
// Release from async chain, the primary chain will acquire later
dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
}

if (auto rsxthr = rsx::get_current_renderer();
rsxthr->get_backend_config().supports_host_gpu_labels)
{
// Queue a sync update on the CB doing the load
auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
ensure(host_data);
const auto event_id = ++host_data->event_counter;
host_data->texture_load_request_event = event_id;
vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
}
}

void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
12 changes: 12 additions & 0 deletions rpcs3/Emu/RSX/VK/vkutils/sync.h
@@ -16,6 +16,18 @@ namespace vk
gpu = 1
};

struct host_data_t // Pick a better name
{
u64 magic = 0xCAFEBABE;                // sanity marker checked after mapping the buffer
u64 event_counter = 0;                 // monotonic source for all event IDs below
u64 texture_load_request_event = 0;    // set when a texture upload is queued for the GPU
u64 texture_load_complete_event = 0;   // written by the GPU when that upload's command buffer runs
u64 last_label_release_event = 0;      // set when a label write is handed to the GPU
u64 last_label_submit_event = 0;       // last label event actually carried by a submitted command buffer
u64 commands_complete_event = 0;       // written by the GPU when the submitted command buffer finishes
u64 last_label_request_timestamp = 0;
};

Review comments on the 0xCAFEBABE magic value (Contributors): "why not 0xB0BAFEDD ?" / "0xBEEFFACE" / "0xB16B00B5"

struct fence
{
atomic_t<bool> flushed = false;