Metal renderer: implement DoCAS with the FidelityFX CAS compute shader (ffx_cas.h).

Adds cas.metal (CASFloat / CASHalf kernels), a shared MakeComputePipeline helper,
read/write texture usage for RWTexture, and a has_fast_half device hint that picks
the kernel variant. MakeComputePipeline checks the returned pipeline state rather
than an uninitialised NSError pointer.

diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt
index 3a83fe5d0af3e..1d28e53c57e4b 100644
--- a/pcsx2/CMakeLists.txt
+++ b/pcsx2/CMakeLists.txt
@@ -794,6 +794,7 @@ if(USE_VULKAN)
 endif()
 
 set(pcsx2GSMetalShaders
+	GS/Renderers/Metal/cas.metal
 	GS/Renderers/Metal/convert.metal
 	GS/Renderers/Metal/present.metal
 	GS/Renderers/Metal/merge.metal
diff --git a/pcsx2/GS/Renderers/Metal/GSDeviceMTL.h b/pcsx2/GS/Renderers/Metal/GSDeviceMTL.h
index 729c70893bbc6..7e3dd775e869a 100644
--- a/pcsx2/GS/Renderers/Metal/GSDeviceMTL.h
+++ b/pcsx2/GS/Renderers/Metal/GSDeviceMTL.h
@@ -237,6 +237,7 @@ class GSDeviceMTL final : public GSDevice
 	MRCOwned<id<MTLFence>> m_spin_fence;
 
 	// Functions and Pipeline States
+	MRCOwned<id<MTLComputePipelineState>> m_cas_pipeline[2];
 	MRCOwned<id<MTLRenderPipelineState>> m_convert_pipeline[static_cast<int>(ShaderConvert::Count)];
 	MRCOwned<id<MTLRenderPipelineState>> m_present_pipeline[static_cast<int>(PresentShader::Count)];
 	MRCOwned<id<MTLRenderPipelineState>> m_convert_pipeline_copy[2];
@@ -359,6 +360,7 @@ class GSDeviceMTL final : public GSDevice
 
 	MRCOwned<id<MTLFunction>> LoadShader(NSString* name);
 	MRCOwned<id<MTLRenderPipelineState>> MakePipeline(MTLRenderPipelineDescriptor* desc, id<MTLFunction> vertex, id<MTLFunction> fragment, NSString* name);
+	MRCOwned<id<MTLComputePipelineState>> MakeComputePipeline(id<MTLFunction> compute, NSString* name);
 	bool Create() override;
 
 	void ClearRenderTarget(GSTexture* t, const GSVector4& c) override;
diff --git a/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm b/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm
index 60db4d39761b8..5a412cf06facc 100644
--- a/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm
+++ b/pcsx2/GS/Renderers/Metal/GSDeviceMTL.mm
@@ -503,6 +503,9 @@ static constexpr MTLPixelFormat ConvertPixelFormat(GSTexture::Format format)
 			else
 				[desc setUsage:MTLTextureUsageShaderRead | MTLTextureUsageRenderTarget];
 			break;
+		case GSTexture::Type::RWTexture:
+			[desc setUsage:MTLTextureUsageShaderRead | MTLTextureUsageShaderWrite];
+			break;
 		default:
 			[desc setUsage:MTLTextureUsageShaderRead | MTLTextureUsageRenderTarget];
 	}
@@ -625,9 +628,27 @@ static constexpr MTLPixelFormat ConvertPixelFormat(GSTexture::Format format)
 #endif
 
 bool GSDeviceMTL::DoCAS(GSTexture* sTex, GSTexture* dTex, bool sharpen_only, const std::array<u32, NUM_CAS_CONSTANTS>& constants)
-{
-	return false;
-}
+{ @autoreleasepool {
+	// One threadgroup per 16x16 destination tile; round up so partial edge tiles are still dispatched.
+	static constexpr int threadGroupWorkRegionDim = 16;
+	const int dispatchX = (dTex->GetWidth() + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim;
+	const int dispatchY = (dTex->GetHeight() + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim;
+	static_assert(sizeof(constants) == sizeof(GSMTLCASPSUniform));
+
+	// A command buffer can have only one active encoder, so finish render encoding before starting compute.
+	EndRenderPass();
+	id<MTLComputeCommandEncoder> enc = [GetRenderCmdBuf() computeCommandEncoder];
+	[enc setLabel:@"CAS"];
+	[enc setComputePipelineState:m_cas_pipeline[sharpen_only]];
+	[enc setTexture:static_cast<GSTextureMTL*>(sTex)->GetTexture() atIndex:0];
+	[enc setTexture:static_cast<GSTextureMTL*>(dTex)->GetTexture() atIndex:1];
+	[enc setBytes:&constants length:sizeof(constants) atIndex:GSMTLBufferIndexUniforms];
+	// 64 threads per group: the CAS kernels have every thread write four pixels of its tile.
+	[enc dispatchThreadgroups:MTLSizeMake(dispatchX, dispatchY, 1)
+	    threadsPerThreadgroup:MTLSizeMake(64, 1, 1)];
+	[enc endEncoding];
+	return true;
+}}
 
 MRCOwned<id<MTLFunction>> GSDeviceMTL::LoadShader(NSString* name)
 {
@@ -658,6 +679,30 @@ static constexpr MTLPixelFormat ConvertPixelFormat(GSTexture::Format format)
 	return res;
 }
 
+/// Creates a compute pipeline state for `compute`, labelled `name`.
+/// Logs the Metal error and throws GSRecoverableError if creation fails.
+MRCOwned<id<MTLComputePipelineState>> GSDeviceMTL::MakeComputePipeline(id<MTLFunction> compute, NSString* name)
+{
+	MRCOwned<MTLComputePipelineDescriptor*> desc = MRCTransfer([MTLComputePipelineDescriptor new]);
+	[desc setLabel:name];
+	[desc setComputeFunction:compute];
+	// Start from nil so a garbage pointer is never read if the call leaves the out-parameter untouched.
+	NSError* err = nil;
+	MRCOwned<id<MTLComputePipelineState>> res = MRCTransfer([m_dev.dev
+		newComputePipelineStateWithDescriptor:desc
+		                              options:0
+		                           reflection:nil
+		                                error:&err]);
+	// Failure is signalled by a nil pipeline state, not by the state of `err`.
+	if (unlikely(!res))
+	{
+		NSString* msg = [NSString stringWithFormat:@"Failed to create pipeline %@: %@", name, [err localizedDescription]];
+		Console.Error("%s", [msg UTF8String]);
+		throw GSRecoverableError();
+	}
+	return res;
+}
+
 static void applyAttribute(MTLVertexDescriptor* desc, NSUInteger idx, MTLVertexFormat fmt, NSUInteger offset, NSUInteger buffer_index)
 {
 	MTLVertexAttributeDescriptor* attrs = desc.attributes[idx];
@@ -704,6 +749,7 @@ static void setFnConstantI(MTLFunctionConstantValues* fc, unsigned int value, GS
 	m_features.framebuffer_fetch = m_dev.features.framebuffer_fetch;
 	m_features.dual_source_blend = true;
 	m_features.stencil_buffer = true;
+	m_features.cas_sharpening = true;
 
 	try
 	{
@@ -725,12 +771,13 @@ static void setFnConstantI(MTLFunctionConstantValues* fc, unsigned int value, GS
 		[clearSpinBuffer fillBuffer:m_spin_buffer range:NSMakeRange(0, 4) value:0];
 		[clearSpinBuffer updateFence:m_spin_fence];
 		[clearSpinBuffer endEncoding];
-		NSError* err = nullptr;
-		m_spin_pipeline = MRCTransfer([m_dev.dev newComputePipelineStateWithFunction:LoadShader(@"waste_time") error:&err]);
-		if (err)
+		m_spin_pipeline = MakeComputePipeline(LoadShader(@"waste_time"), @"waste_time");
+
+		for (int sharpen_only = 0; sharpen_only < 2; sharpen_only++)
 		{
-			Console.Error("Failed to create spin pipeline: %s", [[err localizedDescription] UTF8String]);
-			return false;
+			setFnConstantB(m_fn_constants, sharpen_only, GSMTLConstantIndex_CAS_SHARPEN_ONLY);
+			NSString* shader = m_dev.features.has_fast_half ? @"CASHalf" : @"CASFloat";
+			m_cas_pipeline[sharpen_only] = MakeComputePipeline(LoadShader(shader), sharpen_only ? @"CAS Sharpen" : @"CAS Upscale");
 		}
 
 		m_hw_vertex = MRCTransfer([MTLVertexDescriptor new]);
diff --git a/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.h b/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.h
index c0f8c1740ae66..87ca48a6f1e1f 100644
--- a/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.h
+++ b/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.h
@@ -42,6 +42,7 @@ struct GSMTLDevice
 		bool framebuffer_fetch;
 		bool primid;
 		bool slow_color_compression; ///< Color compression seems to slow down rt read on AMD
+		bool has_fast_half; ///< Heuristic guess; when set, the CASHalf kernel is used instead of CASFloat
 		MetalVersion shader_version;
 		int max_texsize;
 	};
diff --git a/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.mm b/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.mm
index 9a8eafef7c9bb..22cb6456be7d3 100644
--- a/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.mm
+++ b/pcsx2/GS/Renderers/Metal/GSMTLDeviceInfo.mm
@@ -152,6 +152,10 @@ static DetectionResult detectIntelGPU(id<MTLDevice> dev, id<MTLLibrary> lib)
 	if ([dev supportsFamily:MTLGPUFamilyApple1])
 		features.framebuffer_fetch = true;
 
+	if (@available(macOS 10.15, iOS 13.0, *))
+		if ([dev supportsFamily:MTLGPUFamilyMac2] || [dev supportsFamily:MTLGPUFamilyApple1])
+			features.has_fast_half = true; // Approximate guess
+
 	features.shader_version = detectLibraryVersion(shaders);
 	if (features.framebuffer_fetch && features.shader_version < MetalVersion::Metal23)
 	{
diff --git a/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h b/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h
index 5ff545b8d0fe3..05634115fb6f9 100644
--- a/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h
+++ b/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h
@@ -57,6 +57,13 @@ struct GSMTLInterlacePSUniform
 	vector_float4 ZrH;
 };
 
+struct GSMTLCASPSUniform
+{
+	vector_uint4 const0;
+	vector_uint4 const1;
+	vector_int2 srcOffset;
+};
+
 struct GSMTLMainVertex
 {
 	vector_float2 st;
@@ -132,6 +139,7 @@ enum class GSMTLExpandType : unsigned char
 
 enum GSMTLFnConstants
 {
+	GSMTLConstantIndex_CAS_SHARPEN_ONLY,
 	GSMTLConstantIndex_SCALING_FACTOR,
 	GSMTLConstantIndex_FRAMEBUFFER_FETCH,
 	GSMTLConstantIndex_FST,
diff --git a/pcsx2/GS/Renderers/Metal/cas.metal b/pcsx2/GS/Renderers/Metal/cas.metal
new file mode 100644
index 0000000000000..b45c1f6c2afbb
--- /dev/null
+++ b/pcsx2/GS/Renderers/Metal/cas.metal
@@ -0,0 +1,116 @@
+/*  PCSX2 - PS2 Emulator for PCs
+ *  Copyright (C) 2002-2022  PCSX2 Dev Team
+ *
+ *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ *  of the GNU Lesser General Public License as published by the Free Software Found-
+ *  ation, either version 3 of the License, or (at your option) any later version.
+ *
+ *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ *  PURPOSE.  See the GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along with PCSX2.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define A_GPU 1
+#define A_MSL 1
+#define A_HALF 1
+
+#include "../../../../bin/resources/shaders/common/ffx_a.h"
+
+/// Float source texture plus the texel offset that CasLoad adds to every read coordinate.
+struct CASTextureF
+{
+	const thread texture2d<float>& tex;
+	uint2 offset;
+};
+
+/// Half source texture plus the texel offset that CasLoadH adds to every read coordinate.
+struct CASTextureH
+{
+	const thread texture2d<half>& tex;
+	ushort2 offset;
+};
+
+#define CAS_TEXTURE CASTextureF
+#define CAS_TEXTUREH CASTextureH
+
+A_STATIC AF3 CasLoad(CASTextureF tex, ASU2 coord)
+{
+	return tex.tex.read(AU2(coord) + tex.offset).rgb;
+}
+#define CasInput(r,g,b)
+
+A_STATIC AH3 CasLoadH(CASTextureH tex, ASW2 coord)
+{
+	return tex.tex.read(AW2(coord) + tex.offset).rgb;
+}
+
+A_STATIC void CasInputH(inoutAH2 r, inoutAH2 g, inoutAH2 b){}
+
+#include "../../../../bin/resources/shaders/common/ffx_cas.h"
+
+#include "GSMTLShaderCommon.h"
+
+constant bool CAS_SHARPEN_ONLY [[function_constant(GSMTLConstantIndex_CAS_SHARPEN_ONLY)]];
+
+/// Float CAS kernel: each thread filters and writes four pixels, at offsets (0,0), (8,0), (8,8) and (0,8).
+kernel void CASFloat(
+	uint2 localID [[thread_position_in_threadgroup]],
+	uint2 workgroupID [[threadgroup_position_in_grid]],
+	texture2d<float> input [[texture(0)]],
+	texture2d<float, access::write> output [[texture(1)]],
+	constant GSMTLCASPSUniform& cb [[buffer(GSMTLBufferIndexUniforms)]])
+{
+	// Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
+	AU2 gxy = ARmp8x8(localID.x) + (workgroupID << 4);
+	const AU4 const0 = cb.const0;
+	const AU4 const1 = cb.const1;
+	const CASTextureF tex{input, AU2(cb.srcOffset)};
+
+	// Filter.
+	float r, g, b;
+
+	CasFilter(tex, r, g, b, gxy, const0, const1, CAS_SHARPEN_ONLY);
+	output.write(float4(r, g, b, 1), gxy);
+	gxy.x += 8;
+
+	CasFilter(tex, r, g, b, gxy, const0, const1, CAS_SHARPEN_ONLY);
+	output.write(float4(r, g, b, 1), gxy);
+	gxy.y += 8;
+
+	CasFilter(tex, r, g, b, gxy, const0, const1, CAS_SHARPEN_ONLY);
+	output.write(float4(r, g, b, 1), gxy);
+	gxy.x -= 8;
+
+	CasFilter(tex, r, g, b, gxy, const0, const1, CAS_SHARPEN_ONLY);
+	output.write(float4(r, g, b, 1), gxy);
+}
+
+/// Half CAS kernel: each CasFilterH call yields two pixels (half2 lanes), written at gxy and gxy + (8,0).
+kernel void CASHalf(
+	uint2 localID [[thread_position_in_threadgroup]],
+	uint2 workgroupID [[threadgroup_position_in_grid]],
+	texture2d<half> input [[texture(0)]],
+	texture2d<half, access::write> output [[texture(1)]],
+	constant GSMTLCASPSUniform& cb [[buffer(GSMTLBufferIndexUniforms)]])
+{
+	// Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
+	AU2 gxy = ARmp8x8(localID.x) + (workgroupID << 4);
+	const AU4 const0 = cb.const0;
+	const AU4 const1 = cb.const1;
+	const CASTextureH tex{input, AW2(cb.srcOffset)};
+
+	// Filter.
+	half2 r, g, b;
+
+	#pragma unroll
+	for (int i = 0; i < 2; i++)
+	{
+		CasFilterH(tex, r, g, b, gxy, const0, const1, CAS_SHARPEN_ONLY);
+		output.write(half4(r.x, g.x, b.x, 1), gxy);
+		output.write(half4(r.y, g.y, b.y, 1), gxy + AU2(8, 0));
+		gxy.y += 8;
+	}
+}