vk: Allow host GPU to synchronize directly with CELL via RSX semaphores #11568

Merged · 5 commits · Feb 23, 2022
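
In short: instead of the CPU waiting on the GPU and then writing RSX/GCM labels (the semaphores CELL polls) itself, the Vulkan backend can now ask the host GPU to write the label value directly into guest-visible memory with vkCmdUpdateBuffer. A small host-visible block of event counters (vk::host_data_t, added in vkutils/sync.h below) tells the RSX thread how far the GPU has progressed. A minimal sketch of that handshake, with illustrative names and std::atomic standing in for the volatile mapping the real code uses (see VKGSRender::release_GCM_label in the diff):

// Sketch only; assumes x86 for _mm_pause().
#include <atomic>
#include <cstdint>
#include <immintrin.h>

struct host_events
{
	std::atomic<std::uint64_t> last_label_release_event{0}; // bumped when a label write is handed to the GPU
	std::atomic<std::uint64_t> commands_complete_event{0};  // stamped by the GPU when the carrying command buffer retires
};

inline void drain_label_queue(const host_events& ev)
{
	// Spin until the GPU has consumed every label release queued so far.
	while (ev.commands_complete_event.load(std::memory_order_acquire) <
	       ev.last_label_release_event.load(std::memory_order_acquire))
	{
		_mm_pause();
	}
}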
2 changes: 2 additions & 0 deletions rpcs3/Emu/RSX/RSXThread.h
@@ -590,6 +590,7 @@ namespace rsx
bool supports_hw_conditional_render; // Conditional render
bool supports_passthrough_dma; // DMA passthrough
bool supports_asynchronous_compute; // Async compute
bool supports_host_gpu_labels; // Advanced host synchronization
};

struct sampled_image_descriptor_base;
@@ -859,6 +860,7 @@ namespace rsx
void sync();
flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
virtual void sync_hint(FIFO_hint hint, void* args);
virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }
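
The base implementation above returns false, so a backend that cannot service the label on the GPU timeline simply declines and the caller keeps using the existing CPU write. A hypothetical call site, for illustration only (write_label_from_cpu is a stand-in, not part of this diff):

void on_gcm_semaphore_release(rsx::thread* rsxthr, u32 address, u32 value)
{
	// Let the backend try to perform the write on the GPU timeline first.
	if (!rsxthr->release_GCM_label(address, value))
	{
		write_label_from_cpu(address, value); // stand-in for the current CPU path
	}
}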

std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;

168 changes: 142 additions & 26 deletions rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -550,22 +550,31 @@ VKGSRender::VKGSRender() : GSRender()
// Relaxed query synchronization
backend_config.supports_hw_conditional_render = !!g_cfg.video.relaxed_zcull_sync;

// Passthrough DMA
backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();

// Host sync
backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;

// Async compute and related operations
if (g_cfg.video.vk.asynchronous_texture_streaming)
{
// Optimistic, enable async compute
backend_config.supports_asynchronous_compute = true;

if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
{
rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
backend_config.supports_asynchronous_compute = false;
}
}

// Sanity checks
switch (vk::get_driver_vendor())
{
case vk::driver_vendor::NVIDIA:
	if (backend_config.supports_asynchronous_compute)
	{
		if (auto chip_family = vk::get_chip_family();
			chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
		{

@@ -574,34 +583,54 @@ VKGSRender::VKGSRender() : GSRender()

		rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
		g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
	}
	break;
#if !defined(_WIN32)
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
case vk::driver_vendor::RADV:
case vk::driver_vendor::AMD:
#if !defined(__linux__)
// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
case vk::driver_vendor::ANV:
#endif
	if (backend_config.supports_passthrough_dma)
	{
		rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
		backend_config.supports_passthrough_dma = false;
	}
	break;
#endif
case vk::driver_vendor::MVK:
	// Async compute crashes immediately on Apple GPUs
	rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
	backend_config.supports_asynchronous_compute = false;
	break;
default: break;
}

if (backend_config.supports_asynchronous_compute)
{
// Run only if async compute can be used.
g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
}

if (backend_config.supports_host_gpu_labels)
{
if (backend_config.supports_passthrough_dma)
{
m_host_object_data = std::make_unique<vk::buffer>(*m_device,
0x10000,
memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
VMM_ALLOCATION_POOL_SYSTEM);

m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
ensure(m_host_data_ptr->magic == 0xCAFEBABE);
}
else
{
rsx_log.error("Your GPU/driver does not support extensions required to enable passthrough DMA emulation. Host GPU labels will be disabled.");
backend_config.supports_host_gpu_labels = false;
}
}
}
@@ -629,6 +658,13 @@ VKGSRender::~VKGSRender()
g_fxo->get<vk::AsyncTaskScheduler>().destroy();
}

// Host data
if (m_host_object_data)
{
m_host_object_data->unmap();
m_host_object_data.reset();
}

// Clear flush requests
m_flush_requests.clear_pending_flag();

@@ -1453,6 +1489,75 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
m_current_command_buffer->begin();
}

bool VKGSRender::release_GCM_label(u32 address, u32 args)
{
if (!backend_config.supports_host_gpu_labels)
{
return false;
}

auto drain_label_queue = [this]()
{
while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
{
_mm_pause();

if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
}
};

ensure(m_host_data_ptr);
if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
{
// All texture loads already seen by the host GPU
// Wait for all previously submitted labels to be flushed
drain_label_queue();
return false;
}

const auto mapping = vk::map_dma(address, 4);
const auto write_data = std::bit_cast<u32, be_t<u32>>(args);

if (!dynamic_cast<vk::memory_block_host*>(mapping.second->memory.get()))
{
// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
// Take the L and try the fallback.
rsx_log.warning("Host label update at 0x%x was not possible.", address);
drain_label_queue();
return false;
}

m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;

// The newest texture load request has not been covered by a submission yet; record the label write on the primary command buffer and flush.
if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
{
if (vk::is_renderpass_open(*m_current_command_buffer))
{
vk::end_renderpass(*m_current_command_buffer);
}

vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
flush_command_queue();
}
else
{
// The outstanding texture load was already covered by an earlier submission; write the label and the completion event from a secondary command buffer so the primary one does not need to be flushed.
auto cmd = m_secondary_cb_list.next();
cmd->begin();
vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
cmd->end();

vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
cmd->submit(submit_info);

m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}
return true;
}

void VKGSRender::sync_hint(rsx::FIFO_hint hint, void* args)
{
ensure(args);
@@ -2088,6 +2193,17 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}

// Publish any label releases recorded since the last submit: the GPU stamps the newest event ID into commands_complete_event when this command buffer finishes, unblocking the drain loop in release_GCM_label().
if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
{
vkCmdUpdateBuffer(*m_current_command_buffer,
m_host_object_data->value,
::offset32(&vk::host_data_t::commands_complete_event),
sizeof(u64),
const_cast<u64*>(&m_host_data_ptr->last_label_release_event));

m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}

m_current_command_buffer->end();
m_current_command_buffer->tag();

7 changes: 7 additions & 0 deletions rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -117,6 +117,9 @@ class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
vk::command_buffer_chunk* m_current_command_buffer = nullptr;
VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE;

volatile vk::host_data_t* m_host_data_ptr = nullptr;
std::unique_ptr<vk::buffer> m_host_object_data;

VkDescriptorSetLayout descriptor_layouts;
VkPipelineLayout pipeline_layout;

@@ -242,6 +245,7 @@ class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
void bind_viewport();

void sync_hint(rsx::FIFO_hint hint, void* args) override;
bool release_GCM_label(u32 address, u32 data) override;

void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
@@ -259,6 +263,9 @@ class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
void begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) override;
void end_conditional_rendering() override;

// Host sync object
inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }

protected:
void clear_surface(u32 mask) override;
void begin() override;
12 changes: 12 additions & 0 deletions rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -9,6 +9,7 @@

#include "vkutils/data_heap.h"
#include "vkutils/image_helpers.h"
#include "VKGSRender.h"

#include "../GCM.h"
#include "../rsx_utils.h"
@@ -1146,6 +1147,17 @@ namespace vk
// Release from async chain, the primary chain will acquire later
dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
}

if (auto rsxthr = rsx::get_current_renderer();
rsxthr->get_backend_config().supports_host_gpu_labels)
{
// Queue a sync update on the CB doing the load
auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
ensure(host_data);
const auto event_id = ++host_data->event_counter;
host_data->texture_load_request_event = event_id;
vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
}
}

void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
12 changes: 12 additions & 0 deletions rpcs3/Emu/RSX/VK/vkutils/sync.h
@@ -16,6 +16,18 @@ namespace vk
gpu = 1
};

struct host_data_t // Pick a better name
{
u64 magic = 0xCAFEBABE;                // sanity marker checked after mapping the buffer
u64 event_counter = 0;                 // monotonic source for all event IDs below
u64 texture_load_request_event = 0;    // set when a texture upload is queued for the GPU
u64 texture_load_complete_event = 0;   // written by the GPU when that upload's command buffer runs
u64 last_label_release_event = 0;      // set when a label write is handed to the GPU
u64 last_label_submit_event = 0;       // last label event actually carried by a submitted command buffer
u64 commands_complete_event = 0;       // written by the GPU when the submitted command buffer finishes
u64 last_label_request_timestamp = 0;
};

Review comments on the 0xCAFEBABE magic value (Contributors): "why not 0xB0BAFEDD ?" / "0xBEEFFACE" / "0xB16B00B5"

struct fence
{
atomic_t<bool> flushed = false;