From 85e5b077f7da1497d83bc2d74d49246d059a47ff Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 6 Sep 2020 18:17:08 +0300 Subject: [PATCH] gl: Overhaul upload and download routines for textures to go through shared image_to_buffer and buffer_to_image routines. - This automatically adds support for depth float textures as well --- rpcs3/Emu/RSX/GL/GLCompute.h | 248 ++++++++++++-- rpcs3/Emu/RSX/GL/GLHelpers.h | 36 +- rpcs3/Emu/RSX/GL/GLRenderTargets.cpp | 11 +- rpcs3/Emu/RSX/GL/GLTexture.cpp | 484 ++++++++++++++------------- rpcs3/Emu/RSX/GL/GLTexture.h | 24 +- rpcs3/Emu/RSX/GL/GLTextureCache.h | 48 ++- 6 files changed, 526 insertions(+), 325 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 3c612973752d..7d22bd7d2e81 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -110,7 +110,7 @@ namespace gl u32 m_data_length = 0; u32 kernel_size = 1; - std::string uniforms, variables, work_kernel, loop_advance, suffix; + std::string uniforms, variables, work_kernel, loop_advance, suffix, method_declarations; cs_shuffle_base() { @@ -146,10 +146,8 @@ namespace gl "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" "\n" "// Depth format conversions\n" - "#define d24x8_to_x8d24(bits) (bits << 8) | (bits >> 24)\n" - "#define d24x8_to_x8d24_swapped(bits) bswap_u32(d24x8_to_x8d24(bits))\n" - "#define x8d24_to_d24x8(bits) (bits >> 8) | (bits << 24)\n" - "#define x8d24_to_d24x8_swapped(bits) x8d24_to_d24x8(bswap_u32(bits))\n" + "#define d24f_to_f32(bits) (bits << 7)\n" + "#define f32_to_d24f(bits) (bits >> 7)\n" "\n" "uint linear_invocation_id()\n" "{\n" @@ -157,6 +155,7 @@ namespace gl " return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n" "}\n" "\n" + "%md" "void main()\n" "{\n" " uint invocation_id = linear_invocation_id();\n" @@ -173,6 +172,7 @@ namespace gl { "%vars", variables }, { "%f", function_name }, { "%ub", uniforms }, + { "%md", method_declarations } }; m_src = fmt::replace_all(m_src, syntax_replace); @@ -265,35 +265,229 @@ namespace gl } }; - template - struct cs_shuffle_d24x8_to_x8d24 : cs_shuffle_base + struct cs_shuffle_d32fx8_to_x8d24f : cs_shuffle_base { - cs_shuffle_d24x8_to_x8d24() + u32 m_ssbo_length = 0; + + cs_shuffle_d32fx8_to_x8d24f() { - if constexpr (_SwapBytes) - { - cs_shuffle_base::build("d24x8_to_x8d24_swapped"); - } - else - { - cs_shuffle_base::build("d24x8_to_x8d24"); - } + uniforms = "uniform uint in_ptr, out_ptr;\n"; + + variables = + " uint in_offset = in_ptr >> 2;\n" + " uint out_offset = out_ptr >> 2;\n" + " uint depth, stencil;\n"; + + work_kernel = + " depth = data[index * 2 + in_offset];\n" + " stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n" + " value = f32_to_d24f(depth) << 8;\n" + " value |= stencil;\n" + " data[index + out_ptr] = bswap_u32(value);\n"; + + cs_shuffle_base::build(""); + } + + void bind_resources() override + { + m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); + } + + void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) + { + u32 data_offset; + if (src_offset > dst_offset) + { + data_offset = dst_offset; + m_ssbo_length = (src_offset + num_texels * 8) - data_offset; + } + else + { + data_offset = src_offset; + m_ssbo_length = (dst_offset + num_texels * 4) - data_offset; + } + + m_program.uniforms["in_ptr"] = src_offset - data_offset; + m_program.uniforms["out_ptr"] = dst_offset - data_offset; + cs_shuffle_base::run(data, num_texels * 4, data_offset); } }; - template - struct cs_shuffle_x8d24_to_d24x8 : cs_shuffle_base + struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base { - cs_shuffle_x8d24_to_d24x8() + u32 m_ssbo_length = 0; + + cs_shuffle_x8d24f_to_d32fx8() { - if constexpr (_SwapBytes) - { - cs_shuffle_base::build("x8d24_to_d24x8_swapped"); - } - else - { - cs_shuffle_base::build("x8d24_to_d24x8"); - } + uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n"; + + variables = + " uint in_offset = in_ptr >> 2;\n" + " uint out_offset = out_ptr >> 2;\n" + " uint depth, stencil;\n"; + + work_kernel = + " value = data[index + in_offset];\n" + " value = bswap_u32(value);\n" + " stencil = (value & 0xFFu);\n" + " depth = (value >> 8);\n" + " data[index * 2 + out_offset] = d24f_to_f32(depth);\n" + " data[index * 2 + (out_offset + 1)] = stencil;\n"; + + cs_shuffle_base::build(""); + } + + void bind_resources() override + { + m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); + } + + void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels) + { + u32 data_offset; + if (src_offset > dst_offset) + { + data_offset = dst_offset; + m_ssbo_length = (src_offset + num_texels * 4) - data_offset; + } + else + { + data_offset = src_offset; + m_ssbo_length = (dst_offset + num_texels * 8) - data_offset; + } + + m_program.uniforms["in_ptr"] = src_offset - data_offset; + m_program.uniforms["out_ptr"] = dst_offset - data_offset; + cs_shuffle_base::run(data, num_texels * 4, data_offset); + } + }; + + + template + struct cs_fconvert_task : cs_shuffle_base + { + u32 m_ssbo_length = 0; + + void declare_f16_expansion() + { + method_declarations += + "uvec2 unpack_e4m12_pack16(const in uint value)\n" + "{\n" + " uvec2 result = uvec2(bitfieldExtract(value, 0, 16), bitfieldExtract(value, 16, 16));\n" + " result <<= 11;\n" + " result += (120 << 23);\n" + " return result;\n" + "}\n\n"; + } + + void declare_f16_contraction() + { + method_declarations += + "uint pack_e4m12_pack16(const in uvec2 value)\n" + "{\n" + " uvec2 result = (value - (120 << 23)) >> 11;\n" + " return (result.x & 0xFFFF) | (result.y << 16);\n" + "}\n\n"; + } + + cs_fconvert_task() + { + uniforms = + "uniform uint data_length_in_bytes, in_ptr, out_ptr;\n"; + + variables = + " uint block_length = data_length_in_bytes >> 2;\n" + " uint in_offset = in_ptr >> 2;\n" + " uint out_offset = out_ptr >> 2;\n" + " uvec4 tmp;\n"; + + work_kernel = + " if (index >= block_length)\n" + " return;\n"; + + if constexpr (sizeof(From) == 4) + { + static_assert(sizeof(To) == 2); + declare_f16_contraction(); + + work_kernel += + " const uint src_offset = (index * 2) + in_offset;\n" + " const uint dst_offset = index + out_offset;\n" + " tmp.x = data[src_offset];\n" + " tmp.y = data[src_offset + 1];\n"; + + if constexpr (_SwapSrc) + { + work_kernel += + " tmp = bswap_u32(tmp);\n"; + } + + // Convert + work_kernel += " tmp.z = pack_e4m12_pack16(tmp.xy);\n"; + + if constexpr (_SwapDst) + { + work_kernel += " tmp.z = bswap_u16(tmp.z);\n"; + } + + work_kernel += " data[dst_offset] = tmp.z;\n"; + } + else + { + static_assert(sizeof(To) == 4); + declare_f16_expansion(); + + work_kernel += + " const uint src_offset = index + in_offset;\n" + " const uint dst_offset = (index * 2) + out_offset;\n" + " tmp.x = data[src_offset];\n"; + + if constexpr (_SwapSrc) + { + work_kernel += + " tmp.x = bswap_u16(tmp.x);\n"; + } + + // Convert + work_kernel += " tmp.yz = unpack_e4m12_pack16(tmp.x);\n"; + + if constexpr (_SwapDst) + { + work_kernel += " tmp.yz = bswap_u32(tmp.yz);\n"; + } + + work_kernel += + " data[dst_offset] = tmp.y;\n" + " data[dst_offset + 1] = tmp.z;\n"; + } + + cs_shuffle_base::build(""); + } + + void bind_resources() override + { + m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length); + } + + void run(const gl::buffer* data, u32 src_offset, u32 src_length, u32 dst_offset) + { + u32 data_offset; + if (src_offset > dst_offset) + { + m_ssbo_length = (src_offset + src_length) - dst_offset; + data_offset = dst_offset; + } + else + { + m_ssbo_length = (dst_offset - src_offset) + (src_length / sizeof(From)) * sizeof(To); + data_offset = src_offset; + } + + m_program.uniforms["data_length_in_bytes"] = src_length; + m_program.uniforms["in_ptr"] = src_offset - data_offset; + m_program.uniforms["out_ptr"] = dst_offset - data_offset; + + cs_shuffle_base::run(data, src_length, data_offset); } }; diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index d9ee67be8a6d..9d54424395a4 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -1479,14 +1479,6 @@ namespace gl enum class internal_format { - r = GL_RED, - rg = GL_RG, - rgb = GL_RGB, - rgba = GL_RGBA, - - bgr = GL_BGR, - bgra = GL_BGRA, - stencil8 = GL_STENCIL_INDEX8, depth16 = GL_DEPTH_COMPONENT16, depth32f = GL_DEPTH_COMPONENT32F, @@ -1821,7 +1813,7 @@ namespace gl return m_component_layout; } - void copy_from(const void* src, texture::format format, texture::type type, const coord3u region, const pixel_unpack_settings& pixel_settings) + void copy_from(const void* src, texture::format format, texture::type type, int level, const coord3u region, const pixel_unpack_settings& pixel_settings) { pixel_settings.apply(); @@ -1829,25 +1821,25 @@ namespace gl { case GL_TEXTURE_1D: { - DSA_CALL(TextureSubImage1D, m_id, GL_TEXTURE_1D, 0, region.x, region.width, static_cast(format), static_cast(type), src); + DSA_CALL(TextureSubImage1D, m_id, GL_TEXTURE_1D, level, region.x, region.width, static_cast(format), static_cast(type), src); break; } case GL_TEXTURE_2D: { - DSA_CALL(TextureSubImage2D, m_id, GL_TEXTURE_2D, 0, region.x, region.y, region.width, region.height, static_cast(format), static_cast(type), src); + DSA_CALL(TextureSubImage2D, m_id, GL_TEXTURE_2D, level, region.x, region.y, region.width, region.height, static_cast(format), static_cast(type), src); break; } case GL_TEXTURE_3D: case GL_TEXTURE_2D_ARRAY: { - DSA_CALL(TextureSubImage3D, m_id, target_, 0, region.x, region.y, region.z, region.width, region.height, region.depth, static_cast(format), static_cast(type), src); + DSA_CALL(TextureSubImage3D, m_id, target_, level, region.x, region.y, region.z, region.width, region.height, region.depth, static_cast(format), static_cast(type), src); break; } case GL_TEXTURE_CUBE_MAP: { if (get_driver_caps().ARB_dsa_supported) { - glTextureSubImage3D(m_id, 0, region.x, region.y, region.z, region.width, region.height, region.depth, static_cast(format), static_cast(type), src); + glTextureSubImage3D(m_id, level, region.x, region.y, region.z, region.width, region.height, region.depth, static_cast(format), static_cast(type), src); } else { @@ -1856,7 +1848,7 @@ namespace gl const auto end = std::min(6u, region.z + region.depth); for (unsigned face = region.z; face < end; ++face) { - glTextureSubImage2DEXT(m_id, GL_TEXTURE_CUBE_MAP_POSITIVE_X + face, 0, region.x, region.y, region.width, region.height, static_cast(format), static_cast(type), ptr); + glTextureSubImage2DEXT(m_id, GL_TEXTURE_CUBE_MAP_POSITIVE_X + face, level, region.x, region.y, region.width, region.height, static_cast(format), static_cast(type), ptr); ptr += (region.width * region.height * 4); //TODO } } @@ -1868,7 +1860,7 @@ namespace gl void copy_from(const void* src, texture::format format, texture::type type, const pixel_unpack_settings& pixel_settings) { const coord3u region = { {}, size3D() }; - copy_from(src, format, type, region, pixel_settings); + copy_from(src, format, type, 0, region, pixel_settings); } void copy_from(buffer &buf, u32 gl_format_type, u32 offset, u32 length) @@ -1884,7 +1876,7 @@ namespace gl copy_from(*view.value(), view.format(), view.offset(), view.range()); } - void copy_to(void* dst, texture::format format, texture::type type, const coord3u& region, const pixel_pack_settings& pixel_settings) const + void copy_to(void* dst, texture::format format, texture::type type, int level, const coord3u& region, const pixel_pack_settings& pixel_settings) const { pixel_settings.apply(); const auto& caps = get_driver_caps(); @@ -1893,13 +1885,13 @@ namespace gl region.width == m_width && region.height == m_height && region.depth == m_depth) { if (caps.ARB_dsa_supported) - glGetTextureImage(m_id, 0, static_cast(format), static_cast(type), INT32_MAX, dst); + glGetTextureImage(m_id, level, static_cast(format), static_cast(type), INT32_MAX, dst); else - glGetTextureImageEXT(m_id, static_cast(m_target), 0, static_cast(format), static_cast(type), dst); + glGetTextureImageEXT(m_id, static_cast(m_target), level, static_cast(format), static_cast(type), dst); } else if (caps.ARB_dsa_supported) { - glGetTextureSubImage(m_id, 0, region.x, region.y, region.z, region.width, region.height, region.depth, + glGetTextureSubImage(m_id, level, region.x, region.y, region.z, region.width, region.height, region.depth, static_cast(format), static_cast(type), INT32_MAX, dst); } else @@ -1907,18 +1899,18 @@ namespace gl // Worst case scenario. For some reason, EXT_dsa does not have glGetTextureSubImage const auto target_ = static_cast(m_target); texture tmp{ target_, region.width, region.height, region.depth, 1, static_cast(m_internal_format) }; - glCopyImageSubData(m_id, target_, 0, region.x, region.y, region.z, tmp.id(), target_, 0, 0, 0, 0, + glCopyImageSubData(m_id, target_, level, region.x, region.y, region.z, tmp.id(), target_, 0, 0, 0, 0, region.width, region.height, region.depth); const coord3u region2 = { {0, 0, 0}, region.size }; - tmp.copy_to(dst, format, type, region2, pixel_settings); + tmp.copy_to(dst, format, type, 0, region2, pixel_settings); } } void copy_to(void* dst, texture::format format, texture::type type, const pixel_pack_settings& pixel_settings) const { const coord3u region = { {}, size3D() }; - copy_to(dst, format, type, region, pixel_settings); + copy_to(dst, format, type, 0, region, pixel_settings); } }; diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp index 27bacf47f671..a4d78153c4da 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp @@ -73,7 +73,7 @@ depth_format rsx::internals::surface_depth_format_to_gl(rsx::surface_depth_forma case rsx::surface_depth_format2::z16_uint: return{ ::gl::texture::type::ushort, ::gl::texture::format::depth, ::gl::texture::internal_format::depth16 }; case rsx::surface_depth_format2::z16_float: - return{ ::gl::texture::type::f16, ::gl::texture::format::depth, ::gl::texture::internal_format::depth32f }; + return{ ::gl::texture::type::f32, ::gl::texture::format::depth, ::gl::texture::internal_format::depth32f }; case rsx::surface_depth_format2::z24s8_uint: if (g_cfg.video.force_high_precision_z_buffer && ::gl::get_driver_caps().ARB_depth_buffer_float_supported) @@ -81,8 +81,7 @@ depth_format rsx::internals::surface_depth_format_to_gl(rsx::surface_depth_forma else return{ ::gl::texture::type::uint_24_8, ::gl::texture::format::depth_stencil, ::gl::texture::internal_format::depth24_stencil8 }; case rsx::surface_depth_format2::z24s8_float: - // TODO, requires separate aspect transfer for reading - return{ ::gl::texture::type::uint_24_8, ::gl::texture::format::depth_stencil, ::gl::texture::internal_format::depth32f_stencil8 }; + return{ ::gl::texture::type::float32_uint8, ::gl::texture::format::depth_stencil, ::gl::texture::internal_format::depth32f_stencil8 }; default: fmt::throw_exception("Unsupported depth format 0x%x" HERE, static_cast(depth_format)); @@ -468,14 +467,12 @@ void gl::render_target::load_memory(gl::command_context& cmd) // TODO: MSAA support if (g_cfg.video.resolution_scale_percent == 100 && spp == 1) [[likely]] { - gl::upload_texture(id(), gcm_format, surface_width, surface_height, 1, 1, - false, rsx::texture_dimension_extended::texture_dimension_2d, { subres }); + gl::upload_texture(this, gcm_format, false, { subres }); } else { auto tmp = std::make_unique(GL_TEXTURE_2D, subres.width_in_block, subres.height_in_block, 1, 1, static_cast(get_internal_format())); - gl::upload_texture(tmp->id(), gcm_format, surface_width, surface_height, 1, 1, - false, rsx::texture_dimension_extended::texture_dimension_2d, { subres }); + gl::upload_texture(tmp.get(), gcm_format, false, { subres }); gl::g_hw_blitter->scale_image(cmd, tmp.get(), this, { 0, 0, subres.width_in_block, subres.height_in_block }, diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index dd6ee9eb40aa..f48131593f44 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -454,6 +454,161 @@ namespace gl fmt::throw_exception("Unknown format 0x%x" HERE, texture_format); } + cs_shuffle_base* get_trivial_transform_job(const pixel_buffer_layout& pack_info) + { + if (!pack_info.swap_bytes) + { + return nullptr; + } + + switch (pack_info.size) + { + case 1: + return nullptr; + case 2: + return get_compute_task(); + break; + case 4: + return get_compute_task(); + break; + default: + fmt::throw_exception("Unsupported format"); + } + } + + void* copy_image_to_buffer(const pixel_buffer_layout& pack_info, const gl::texture* src, gl::buffer* dst, + const int src_level, const coord3u& src_region, image_memory_requirements* mem_info) + { + auto initialize_scratch_mem = [&]() + { + const u64 max_mem = (mem_info->memory_required) ? mem_info->memory_required : mem_info->image_size_in_bytes; + if (!(*dst) || max_mem > static_cast(dst->size())) + { + if (*dst) dst->remove(); + dst->create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY); + } + + dst->bind(buffer::target::pixel_pack); + src->copy_to(nullptr, static_cast(pack_info.format), static_cast(pack_info.type), src_level, src_region, {}); + }; + + void* result = nullptr; + if (src->aspect() == image_aspect::color || + pack_info.type == GL_UNSIGNED_SHORT || + pack_info.type == GL_UNSIGNED_INT_24_8) + { + initialize_scratch_mem(); + if (auto job = get_trivial_transform_job(pack_info)) + { + job->run(dst, static_cast(mem_info->image_size_in_bytes)); + } + } + else if (pack_info.type == GL_FLOAT) + { + verify(HERE), mem_info->image_size_in_bytes == (mem_info->image_size_in_texels * 4); + mem_info->memory_required = (mem_info->image_size_in_texels * 6); + initialize_scratch_mem(); + + get_compute_task>()->run(dst, 0, + static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_bytes)); + result = reinterpret_cast(mem_info->image_size_in_bytes); + } + else if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) + { + verify(HERE), mem_info->image_size_in_bytes == (mem_info->image_size_in_texels * 8); + mem_info->memory_required = (mem_info->image_size_in_texels * 12); + initialize_scratch_mem(); + + get_compute_task()->run(dst, 0, + static_cast(mem_info->image_size_in_bytes), static_cast(mem_info->image_size_in_texels)); + result = reinterpret_cast(mem_info->image_size_in_bytes); + } + else + { + fmt::throw_exception("Invalid depth/stencil type 0x%x" HERE, pack_info.type); + } + + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_PIXEL_BUFFER_BARRIER_BIT); + return result; + } + + void copy_buffer_to_image(const pixel_buffer_layout& unpack_info, gl::buffer* src, gl::texture* dst, + const void* src_offset, const int dst_level, const coord3u& dst_region, image_memory_requirements* mem_info) + { + buffer scratch_mem; + buffer* transfer_buf = src; + bool skip_barrier = false; + u32 in_offset = static_cast(reinterpret_cast(src_offset)); + u32 out_offset = in_offset; + + auto initialize_scratch_mem = [&]() + { + if (in_offset >= mem_info->memory_required) + { + return; + } + + const u64 max_mem = mem_info->memory_required + mem_info->image_size_in_bytes; + if ((max_mem + in_offset) <= static_cast(src->size())) + { + out_offset = static_cast(in_offset + mem_info->image_size_in_bytes); + return; + } + + scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY); + + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + src->copy_to(&scratch_mem, in_offset, 0, mem_info->image_size_in_bytes); + + in_offset = 0; + out_offset = static_cast(mem_info->image_size_in_bytes); + transfer_buf = &scratch_mem; + }; + + if (dst->aspect() == image_aspect::color || + unpack_info.type == GL_UNSIGNED_SHORT || + unpack_info.type == GL_UNSIGNED_INT_24_8) + { + if (auto job = get_trivial_transform_job(unpack_info)) + { + job->run(src, static_cast(mem_info->image_size_in_bytes), in_offset); + } + else + { + skip_barrier = true; + } + } + else if (unpack_info.type == GL_FLOAT) + { + mem_info->memory_required = (mem_info->image_size_in_texels * 4); + initialize_scratch_mem(); + get_compute_task>()->run(transfer_buf, in_offset, static_cast(mem_info->image_size_in_bytes), out_offset); + } + else if (unpack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) + { + mem_info->memory_required = (mem_info->image_size_in_texels * 8); + initialize_scratch_mem(); + get_compute_task()->run(transfer_buf, in_offset, out_offset, static_cast(mem_info->image_size_in_texels)); + } + else + { + fmt::throw_exception("Invalid depth/stencil type 0x%x" HERE, unpack_info.type); + } + + if (!skip_barrier) + { + glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); + } + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); + transfer_buf->bind(buffer::target::pixel_unpack); + + dst->copy_from(reinterpret_cast(u64(out_offset)), static_cast(unpack_info.format), + static_cast(unpack_info.type), dst_level, dst_region, {}); + + if (scratch_mem) scratch_mem.remove(); + } + gl::viewable_image* create_texture(u32 gcm_format, u16 width, u16 height, u16 depth, u16 mipmaps, rsx::texture_dimension_extended type) { @@ -488,8 +643,9 @@ namespace gl return new gl::viewable_image(target, width, height, depth, mipmaps, internal_format, format_class); } - void fill_texture(rsx::texture_dimension_extended dim, u16 mipmap_count, int format, u16 width, u16 height, u16 depth, - const std::vector &input_layouts, bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector& staging_buffer) + void fill_texture(texture* dst, int format, + const std::vector &input_layouts, + bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector& staging_buffer) { rsx::texture_uploader_capabilities caps{ true, false, false, 4 }; @@ -500,9 +656,11 @@ namespace gl { caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA; - unpack_settings.row_length(align(width, 4)); + unpack_settings.row_length(align(dst->width(), 4)); unpack_settings.apply(); + glBindTexture(static_cast(dst->get_target()), dst->id()); + const GLsizei format_block_size = (format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16; for (const rsx::subresource_layout& layout : input_layouts) @@ -510,27 +668,27 @@ namespace gl upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps); const sizei image_size{ align(layout.width_in_texel, 4), align(layout.height_in_texel, 4) }; - switch (dim) + switch (dst->get_target()) { - case rsx::texture_dimension_extended::texture_dimension_1d: + case texture::target::texture1D: { const GLsizei size = layout.width_in_block * format_block_size; glCompressedTexSubImage1D(GL_TEXTURE_1D, layout.level, 0, image_size.width, gl_format, size, staging_buffer.data()); break; } - case rsx::texture_dimension_extended::texture_dimension_2d: + case texture::target::texture2D: { const GLsizei size = layout.width_in_block * layout.height_in_block * format_block_size; glCompressedTexSubImage2D(GL_TEXTURE_2D, layout.level, 0, 0, image_size.width, image_size.height, gl_format, size, staging_buffer.data()); break; } - case rsx::texture_dimension_extended::texture_dimension_cubemap: + case texture::target::textureCUBE: { const GLsizei size = layout.width_in_block * layout.height_in_block * format_block_size; glCompressedTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + layout.layer, layout.level, 0, 0, image_size.width, image_size.height, gl_format, size, staging_buffer.data()); break; } - case rsx::texture_dimension_extended::texture_dimension_3d: + case texture::target::texture3D: { const GLsizei size = layout.width_in_block * layout.height_in_block * layout.depth * format_block_size; glCompressedTexSubImage3D(GL_TEXTURE_3D, layout.level, 0, 0, 0, image_size.width, image_size.height, layout.depth, gl_format, size, staging_buffer.data()); @@ -547,9 +705,11 @@ namespace gl else { bool apply_settings = true; + bool use_compute_transform = false; buffer upload_scratch_mem, compute_scratch_mem; + image_memory_requirements mem_info; + pixel_buffer_layout mem_layout; - cs_shuffle_base* pixel_transform = nullptr; gsl::span dst_buffer = staging_buffer; void* out_pointer = staging_buffer.data(); u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format); @@ -569,90 +729,72 @@ namespace gl apply_settings = (gl_format == GL_RED); caps.supports_byteswap = apply_settings; break; - case GL_UNSIGNED_INT_24_8: - if (gl::get_driver_caps().ARB_compute_shader_supported) - { - apply_settings = false; - pixel_transform = gl::get_compute_task>(); - } - break; case GL_FLOAT: - // TODO: Expand depth16f to depth32f - gl_type = GL_HALF_FLOAT; - break; + case GL_UNSIGNED_INT_24_8: case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: - // TODO: Expand depth24 to depth32f - gl_type = GL_UNSIGNED_INT_24_8; - break; - default: + mem_layout.format = gl_format; + mem_layout.type = gl_type; + mem_layout.swap_bytes = true; + mem_layout.size = 4; + use_compute_transform = true; + apply_settings = false; break; } - if (!apply_settings) - { - unpack_settings.apply(); - } - - if (pixel_transform) + if (use_compute_transform) { upload_scratch_mem.create(staging_buffer.size(), nullptr, buffer::memory_type::host_visible, GL_STREAM_DRAW); - compute_scratch_mem.create(staging_buffer.size(), nullptr, buffer::memory_type::local, GL_STATIC_COPY); + compute_scratch_mem.create(std::max(512, staging_buffer.size() * 3), nullptr, buffer::memory_type::local, GL_STATIC_COPY); out_pointer = nullptr; } for (const rsx::subresource_layout& layout : input_layouts) { - if (pixel_transform) + if (use_compute_transform) { - const u64 row_pitch = rsx::align2(layout.width_in_block * block_size_in_bytes, caps.alignment); + const u64 row_pitch = rsx::align2(layout.width_in_block * block_size_in_bytes, caps.alignment); image_linear_size = row_pitch * layout.height_in_block * layout.depth; dst_buffer = { reinterpret_cast(upload_scratch_mem.map(buffer::access::write)), image_linear_size }; } auto op = upload_texture_subresource(dst_buffer, layout, format, is_swizzled, caps); - if (pixel_transform) + // Define upload region + coord3u region; + region.x = 0; + region.y = 0; + region.z = layout.layer; + region.width = layout.width_in_texel; + region.height = layout.height_in_texel; + region.depth = layout.depth; + + if (use_compute_transform) { // 1. Unmap buffer upload_scratch_mem.unmap(); - // 2. Execute compute job + // 2. Upload memory to GPU upload_scratch_mem.copy_to(&compute_scratch_mem, 0, 0, image_linear_size); - pixel_transform->run(&compute_scratch_mem, image_linear_size); - // 3. Bind compute buffer as pixel unpack buffer - glMemoryBarrier(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); - compute_scratch_mem.bind(buffer::target::pixel_unpack); + // 3. Dispatch compute routines + mem_info.image_size_in_texels = image_linear_size / block_size_in_bytes; + mem_info.image_size_in_bytes = image_linear_size; + mem_info.memory_required = 0; + copy_buffer_to_image(mem_layout, &compute_scratch_mem, dst, nullptr, layout.level, region, & mem_info); } - else if (apply_settings) + else { - unpack_settings.swap_bytes(op.require_swap); - unpack_settings.apply(); - apply_settings = false; - } + if (apply_settings) + { + unpack_settings.swap_bytes(op.require_swap); + apply_settings = false; + } - switch (dim) - { - case rsx::texture_dimension_extended::texture_dimension_1d: - glTexSubImage1D(GL_TEXTURE_1D, layout.level, 0, layout.width_in_texel, gl_format, gl_type, out_pointer); - break; - case rsx::texture_dimension_extended::texture_dimension_2d: - glTexSubImage2D(GL_TEXTURE_2D, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, out_pointer); - break; - case rsx::texture_dimension_extended::texture_dimension_cubemap: - glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + layout.layer, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, out_pointer); - break; - case rsx::texture_dimension_extended::texture_dimension_3d: - glTexSubImage3D(GL_TEXTURE_3D, layout.layer, 0, 0, 0, layout.width_in_texel, layout.height_in_texel, depth, gl_format, gl_type, out_pointer); - break; - default: - ASSUME(0); - fmt::throw_exception("Unreachable" HERE); + dst->copy_from(out_pointer, static_cast(gl_format), static_cast(gl_type), layout.level, region, unpack_settings); } } - if (pixel_transform) + if (use_compute_transform) { upload_scratch_mem.remove(); compute_scratch_mem.remove(); @@ -693,41 +835,18 @@ namespace gl return remap_values; } - void upload_texture(GLuint id, u32 gcm_format, u16 width, u16 height, u16 depth, u16 mipmaps, bool is_swizzled, rsx::texture_dimension_extended type, - const std::vector& subresources_layout) + void upload_texture(texture* dst, u32 gcm_format, bool is_swizzled, const std::vector& subresources_layout) { - GLenum target; - switch (type) - { - case rsx::texture_dimension_extended::texture_dimension_1d: - target = GL_TEXTURE_1D; - break; - case rsx::texture_dimension_extended::texture_dimension_2d: - target = GL_TEXTURE_2D; - break; - case rsx::texture_dimension_extended::texture_dimension_3d: - target = GL_TEXTURE_3D; - break; - case rsx::texture_dimension_extended::texture_dimension_cubemap: - target = GL_TEXTURE_CUBE_MAP; - break; - } - - glBindTexture(target, id); - glTexParameteri(target, GL_TEXTURE_BASE_LEVEL, 0); - glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, mipmaps - 1); - // The rest of sampler state is now handled by sampler state objects - // Calculate staging buffer size - const u32 aligned_pitch = align(width * rsx::get_format_block_size_in_bytes(gcm_format), 4); - size_t texture_data_sz = depth * height * aligned_pitch; + const u32 aligned_pitch = align(dst->pitch(), 4); + size_t texture_data_sz = dst->depth() * dst->height() * aligned_pitch; std::vector data_upload_buf(texture_data_sz); // TODO: GL drivers support byteswapping and this should be used instead of doing so manually const auto format_type = get_format_type(gcm_format); const GLenum gl_format = std::get<0>(format_type); const GLenum gl_type = std::get<1>(format_type); - fill_texture(type, mipmaps, gcm_format, width, height, depth, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf); + fill_texture(dst, gcm_format, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf); } u32 get_format_texel_width(GLenum format) @@ -821,111 +940,12 @@ namespace gl return false; } - cs_shuffle_base* get_trivial_transform_job(const pixel_buffer_layout& pack_info) - { - if (!pack_info.swap_bytes) - { - return nullptr; - } - - switch (pack_info.size) - { - case 1: - return nullptr; - case 2: - return gl::get_compute_task(); - break; - case 4: - return gl::get_compute_task(); - break; - default: - fmt::throw_exception("Unsupported format"); - } - } - - cs_shuffle_base* get_image_to_buffer_job(const pixel_buffer_layout& pack_info, u32 aspect_mask) - { - switch (aspect_mask) - { - case image_aspect::color: - { - return get_trivial_transform_job(pack_info); - } - case image_aspect::depth: - { - if (pack_info.type == GL_FLOAT) - { - // TODO: D16F - return nullptr; - } - - return get_trivial_transform_job(pack_info); - } - case image_aspect::depth | image_aspect::stencil: - { - verify(HERE), pack_info.swap_bytes; - if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) - { - // TODO: D24FX8 - return nullptr; - } - - return gl::get_compute_task>(); - } - default: - { - fmt::throw_exception("Invalid aspect mask 0x%x" HERE, aspect_mask); - } - } - } - - cs_shuffle_base* get_buffer_to_image_job(const pixel_buffer_layout& unpack_info, u32 aspect_mask) - { - switch (aspect_mask) - { - case image_aspect::color: - { - return get_trivial_transform_job(unpack_info); - } - case image_aspect::depth: - { - if (unpack_info.type == GL_FLOAT) - { - // TODO: D16F - return nullptr; - } - - return get_trivial_transform_job(unpack_info); - } - case image_aspect::depth | image_aspect::stencil: - { - verify(HERE), unpack_info.swap_bytes; - if (unpack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) - { - // TODO: D24FX8 - return nullptr; - } - - return gl::get_compute_task>(); - } - default: - { - fmt::throw_exception("Invalid aspect mask 0x%x" HERE, aspect_mask); - } - } - } - void copy_typeless(texture * dst, const texture * src, const coord3u& dst_region, const coord3u& src_region) { - const u32 src_mem = src->pitch() * src_region.height; - const u32 dst_mem = dst->pitch() * dst_region.height; - - auto max_mem = std::max(src_mem, dst_mem); - if (!g_typeless_transfer_buffer || max_mem > g_typeless_transfer_buffer.size()) - { - if (g_typeless_transfer_buffer) g_typeless_transfer_buffer.remove(); - g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY); - } + const auto src_bpp = src->pitch() / src->width(); + const auto dst_bpp = dst->pitch() / dst->width(); + image_memory_requirements src_mem = { src_region.width * src_region.height, src_region.width * src_bpp * src_region.height, 0ull }; + image_memory_requirements dst_mem = { dst_region.width * dst_region.height, dst_region.width * dst_bpp * dst_region.height, 0ull }; const auto& caps = gl::get_driver_caps(); auto pack_info = get_format_type(src); @@ -954,54 +974,31 @@ namespace gl } // Start pack operation - g_typeless_transfer_buffer.bind(buffer::target::pixel_pack); - - if (caps.ARB_compute_shader_supported) [[likely]] - { - // Raw copy - src->copy_to(nullptr, static_cast(pack_info.format), static_cast(pack_info.type), src_region, {}); - } - else - { - pixel_pack_settings pack_settings{}; - pack_settings.swap_bytes(pack_info.swap_bytes); - src->copy_to(nullptr, static_cast(pack_info.format), static_cast(pack_info.type), src_region, pack_settings); - } - - glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); - - // Start unpack operation - pixel_unpack_settings unpack_settings{}; - + void* transfer_offset = nullptr; if (caps.ARB_compute_shader_supported) [[likely]] { - auto src_transform = get_image_to_buffer_job(pack_info, src->aspect()); - auto dst_transform = get_buffer_to_image_job(unpack_info, dst->aspect()); - - if (src->aspect() == gl::image_aspect::color && dst->aspect() == gl::image_aspect::color) + // Apply transformation + bool skip_transform = false; + if ((src->aspect() | dst->aspect()) == gl::image_aspect::color) { - if (src_transform == dst_transform) - { - src_transform = dst_transform = nullptr; - } - else if (src_transform && dst_transform) - { - src_transform = gl::get_compute_task(); - dst_transform = nullptr; - } + skip_transform = (pack_info.format == unpack_info.format && + pack_info.type == unpack_info.type && + pack_info.swap_bytes == unpack_info.swap_bytes && + pack_info.size == unpack_info.size); } - const auto job_length = std::min(src_mem, dst_mem); - if (src_transform) + if (skip_transform) [[likely]] { - src_transform->run(&g_typeless_transfer_buffer, job_length); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_PIXEL_BUFFER_BARRIER_BIT); - } + const bool old_swap_bytes = pack_info.swap_bytes; + pack_info.swap_bytes = false; - if (dst_transform) + copy_image_to_buffer(pack_info, src, &g_typeless_transfer_buffer, 0, src_region, &src_mem); + pack_info.swap_bytes = old_swap_bytes; + } + else { - dst_transform->run(&g_typeless_transfer_buffer, job_length); - glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); + void* data_ptr = copy_image_to_buffer(pack_info, src, &g_typeless_transfer_buffer, 0, src_region, &src_mem); + copy_buffer_to_image(unpack_info, &g_typeless_transfer_buffer, dst, data_ptr, 0, dst_region, &dst_mem); } // NOTE: glBindBufferRange also binds the buffer to the old-school target. @@ -1009,12 +1006,33 @@ namespace gl glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); } else + { + const u64 max_mem = std::max(src_mem.image_size_in_bytes, dst_mem.image_size_in_bytes); + if (!g_typeless_transfer_buffer || max_mem > static_cast(g_typeless_transfer_buffer.size())) + { + if (g_typeless_transfer_buffer) g_typeless_transfer_buffer.remove(); + g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY); + } + + pixel_pack_settings pack_settings{}; + pack_settings.swap_bytes(pack_info.swap_bytes); + + g_typeless_transfer_buffer.bind(buffer::target::pixel_pack); + src->copy_to(nullptr, static_cast(pack_info.format), static_cast(pack_info.type), 0, src_region, pack_settings); + } + + glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); + + // Start unpack operation + pixel_unpack_settings unpack_settings{}; + + if (!caps.ARB_compute_shader_supported) [[unlikely]] { unpack_settings.swap_bytes(unpack_info.swap_bytes); } g_typeless_transfer_buffer.bind(buffer::target::pixel_unpack); - dst->copy_from(nullptr, static_cast(unpack_info.format), static_cast(unpack_info.type), dst_region, unpack_settings); + dst->copy_from(transfer_offset, static_cast(unpack_info.format), static_cast(unpack_info.type), 0, dst_region, unpack_settings); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE); } diff --git a/rpcs3/Emu/RSX/GL/GLTexture.h b/rpcs3/Emu/RSX/GL/GLTexture.h index 86c35e34b940..f47f81000650 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.h +++ b/rpcs3/Emu/RSX/GL/GLTexture.h @@ -21,6 +21,13 @@ namespace gl bool swap_bytes; }; + struct image_memory_requirements + { + u64 image_size_in_texels; + u64 image_size_in_bytes; + u64 memory_required; + }; + GLenum get_target(rsx::texture_dimension_extended type); GLenum get_sized_internal_format(u32 texture_format); std::tuple get_format_type(u32 texture_format); @@ -35,16 +42,13 @@ namespace gl void copy_typeless(texture* dst, const texture* src, const coord3u& dst_region, const coord3u& src_region); void copy_typeless(texture* dst, const texture* src); - /** - * is_swizzled - determines whether input bytes are in morton order - * subresources_layout - descriptor of the mipmap levels in memory - * decoded_remap - two vectors, first one contains index to read, e.g if v[0] = 1 then component 0[A] in the texture should read as component 1[R] - * - layout of vector is in A-R-G-B - * - second vector contains overrides to force the value to either 0 or 1 instead of reading from texture - * static_state - set up the texture without consideration for sampler state (useful for vertex textures which have no real sampler state on RSX) - */ - void upload_texture(GLuint id, u32 gcm_format, u16 width, u16 height, u16 depth, u16 mipmaps, bool is_swizzled, rsx::texture_dimension_extended type, - const std::vector& subresources_layout); + void* copy_image_to_buffer(const pixel_buffer_layout& pack_info, const gl::texture* src, gl::buffer* dst, + const int src_level, const coord3u& src_region, image_memory_requirements* mem_info); + + void copy_buffer_to_image(const pixel_buffer_layout& unpack_info, gl::buffer* src, gl::texture* dst, + const void* src_offset, const int dst_level, const coord3u& dst_region, image_memory_requirements* mem_info); + + void upload_texture(texture* dst, u32 gcm_format, bool is_swizzled, const std::vector& subresources_layout); class sampler_state { diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 4ff754d51603..d0c9cdcd80b6 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -15,7 +15,6 @@ #include "GLRenderTargets.h" #include "GLOverlays.h" #include "GLTexture.h" -#include "GLCompute.h" #include "../Common/TextureUtils.h" #include "../Common/texture_cache.h" @@ -163,38 +162,39 @@ namespace gl pack_unpack_swap_bytes = format_info.swap_bytes; } + real_pitch = src->pitch(); + rsx_pitch = pitch; + bool use_driver_pixel_transform = true; if (get_driver_caps().ARB_compute_shader_supported) [[likely]] { - if (src->aspect() & image_aspect::stencil) + if (src->aspect() & image_aspect::depth) { buffer scratch_mem; - scratch_mem.create(buffer::target::pixel_pack, pbo.size(), nullptr, buffer::memory_type::local, GL_STATIC_COPY); - scratch_mem.bind(); - - pixel_pack_settings pack_settings; - pack_settings.alignment(1); - src->copy_to(nullptr, format, type, pack_settings); // Invoke compute if (auto error = glGetError(); !error) [[likely]] { - cs_shuffle_base * job; - if (pack_unpack_swap_bytes) - { - job = get_compute_task>(); - } - else - { - job = get_compute_task>(); - } + pixel_buffer_layout pack_info{}; + image_memory_requirements mem_info{}; + + pack_info.format = static_cast(format); + pack_info.type = static_cast(type); + pack_info.size = (src->aspect() & image_aspect::stencil) ? 4 : 2; + pack_info.swap_bytes = true; - const auto job_length = src->pitch() * src->height(); - job->run(&scratch_mem, job_length); + mem_info.image_size_in_texels = src->width() * src->height(); + mem_info.image_size_in_bytes = src->pitch() * src->height(); + mem_info.memory_required = 0; + + void* out_offset = copy_image_to_buffer(pack_info, src, &scratch_mem, 0, { {}, src->size3D() }, &mem_info); glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); - scratch_mem.copy_to(&pbo, 0, 0, job_length); + + real_pitch = pack_info.size * src->width(); + const u64 data_length = pack_info.size * mem_info.image_size_in_texels; + scratch_mem.copy_to(&pbo, reinterpret_cast(out_offset), 0, data_length); } else { @@ -222,9 +222,6 @@ namespace gl src->copy_to(nullptr, format, type, pack_settings); } - real_pitch = src->pitch(); - rsx_pitch = pitch; - if (auto error = glGetError()) { if (error == GL_OUT_OF_MEMORY && ::gl::get_driver_caps().vendor_AMD) @@ -561,7 +558,7 @@ namespace gl sized_internal_fmt = gl::get_sized_internal_format(gcm_format); } - std::unique_ptr dst = std::make_unique(dst_type, width, height, depth, mipmaps, sized_internal_fmt); + std::unique_ptr dst = std::make_unique(dst_type, width, height, depth, mipmaps, sized_internal_fmt, rsx::classify_format(gcm_format)); if (copy) { @@ -939,8 +936,7 @@ namespace gl auto section = create_new_texture(cmd, rsx_range, width, height, depth, mipmaps, pitch, gcm_format, context, type, input_swizzled, rsx::texture_create_flags::default_component_order); - gl::upload_texture(section->get_raw_texture()->id(), gcm_format, width, height, depth, mipmaps, - input_swizzled, type, subresource_layout); + gl::upload_texture(section->get_raw_texture(), gcm_format, input_swizzled, subresource_layout); section->last_write_tag = rsx::get_shared_tag(); return section;