Revert "Make sure light loop is scalarized by defining correctly SCALARIZE_LIGHT_LOOP (#1684)"

This reverts commit 2d016c5.
Unity-Technologies · Sep 14, 2020 · f5ab594 · f5ab594
1 parent 5bd028c
commit f5ab594
Show file tree

Hide file tree

Showing 10 changed files with 78 additions and 91 deletions.
diff --git a/com.unity.render-pipelines.high-definition/CHANGELOG.md b/com.unity.render-pipelines.high-definition/CHANGELOG.md
@@ -66,7 +66,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Fixed warning with area mesh (case 1268379)
 - Fixed issue with diffusion profile not being updated upon reset of the editor. 
 - Fixed an issue that lead to corrupted refraction in some scenarios on xbox.
-- Fixed for light loop scalarization not happening. 
 
 ### Changed
 - Preparation pass for RTSSShadows to be supported by render graph.

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/HDShadowLoop.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/HDShadowLoop.hlsl
@@ -66,13 +66,15 @@ void ShadowLoopMin(HDShadowContext shadowContext, PositionInputs posInput, float
 #endif
 
         bool fastPath = false;
+    #if SCALARIZE_LIGHT_LOOP
         uint lightStartLane0;
         fastPath = IsFastPath(lightStart, lightStartLane0);
 
         if (fastPath)
         {
             lightStart = lightStartLane0;
         }
+    #endif
 
         // Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), only the one relevant to current thread/pixel are processed.
         // For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.hlsl
@@ -7,12 +7,9 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/ScreenSpaceLighting/ScreenSpaceGlobalIllumination.cs.hlsl"
 #endif
 
-#ifndef SCALARIZE_LIGHT_LOOP
 // We perform scalarization only for forward rendering as for deferred loads will already be scalar since tiles will match waves and therefore all threads will read from the same tile.
 // More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
 #define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
-#endif
-
 
 //-----------------------------------------------------------------------------
 // LightLoop
@@ -255,11 +252,7 @@ void LightLoop( float3 V, PositionInputs posInput, PreLightData preLightData, BS
         while (v_lightListOffset < lightCount)
         {
             v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
-#if SCALARIZE_LIGHT_LOOP
             uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
-#else
-            uint s_lightIdx = v_lightIdx;
-#endif
             if (s_lightIdx == -1)
                 break;
 

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoopDef.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoopDef.hlsl
@@ -268,6 +268,43 @@ uint FetchIndex(uint lightStart, uint lightOffset)
 
 #endif // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
 
+bool IsFastPath(uint lightStart, out uint lightStartLane0)
+{
+#if SCALARIZE_LIGHT_LOOP
+    // Fast path is when all pixels in a wave are accessing the same tile or cluster, i.e. every active lane's lightStart equals the value read from the first active lane.
+    lightStartLane0 = WaveReadLaneFirst(lightStart);
+    return WaveActiveAllTrue(lightStart == lightStartLane0);
+#else
+    lightStartLane0 = lightStart; // Scalarization disabled: pass lightStart through unchanged.
+    return false;                 // The fast path is never reported in this configuration.
+#endif
+}
+
+// This function scalarizes an index across all lanes. To be efficient it must be used in the context
+// of the scalarization of a loop. It is meant to be used with IsFastPath so it can optimize the number of
+// elements to load, which is optimal when all the lanes are contained in a single tile.
+uint ScalarizeElementIndex(uint v_elementIdx, bool fastPath)
+{
+    uint s_elementIdx = v_elementIdx;
+#if SCALARIZE_LIGHT_LOOP
+    if (!fastPath)
+    {
+        // If we are not in fast path, v_elementIdx is not scalar, so we need to query the Min value across the wave.
+        s_elementIdx = WaveActiveMin(v_elementIdx);
+        // If WaveActiveMin returns 0xffffffff it means that all lanes are actually dead, so we can safely ignore the loop and move forward.
+        // This could happen as a helper lane could reach this point, hence having a valid v_elementIdx, but its value will be ignored by the WaveActiveMin.
+        if (s_elementIdx == -1)
+        {
+            return -1; // Sentinel (0xffffffff once converted to uint): the loops using this helper test for -1 and break.
+        }
+    }
+    // Note that the WaveReadLaneFirst should not be needed, but the compiler might insist on putting the result in VGPR.
+    // However, we are certain at this point that the index is scalar.
+    s_elementIdx = WaveReadLaneFirst(s_elementIdx);
+#endif
+    return s_elementIdx; // When SCALARIZE_LIGHT_LOOP is false this is v_elementIdx unchanged (no scalarization).
+}
+
 uint FetchIndexWithBoundsCheck(uint start, uint count, uint i)
 {
     if (i < count)

diff --git a/....render-pipelines.high-definition/Runtime/Lighting/ProbeVolume/ProbeVolumeAccumulate.hlsl b/....render-pipelines.high-definition/Runtime/Lighting/ProbeVolume/ProbeVolumeAccumulate.hlsl
@@ -1,13 +1,6 @@
 // This file should only be included inside of ProbeVolume.hlsl.
 // There are no #ifndef HEADER guards to stop multiple inclusion, as this is simply used for code gen.
 
-#ifndef SCALARIZE_LIGHT_LOOP
-// We perform scalarization only for forward rendering as for deferred loads will already be scalar since tiles will match waves and therefore all threads will read from the same tile.
-// More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
-#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
-#endif
-
-
 #ifndef PROBE_VOLUMES_ACCUMULATE_MODE
     #error "PROBE_VOLUMES_ACCUMULATE_MODE must be defined as 0, 1, or 2 before including ProbeVolumeAccumulate.hlsl. 0 triggers generation of SH0 variant, 1 triggers generation of SH1 variant, and 2 triggers generation of SH2 variant.";
 #endif
@@ -47,35 +40,17 @@
     if (weightHierarchy >= 1.0) { return; }
 #endif
 
-    bool fastPath = false;
-
-
     uint probeVolumeStart, probeVolumeCount;
-    // Fetch first probe volume to provide the scene proxy for screen space computation
-    ProbeVolumeGetCountAndStart(posInput, probeVolumeStart, probeVolumeCount);
-
-#if SCALARIZE_LIGHT_LOOP
-    uint probeStartLane0;
-    fastPath = IsFastPath(probeVolumeStart, probeStartLane0);
-
-    if (fastPath)
-    {
-        probeVolumeStart = probeStartLane0;
-    }
-#endif
+    bool fastPath;
+    ProbeVolumeGetCountAndStartAndFastPath(posInput, probeVolumeStart, probeVolumeCount, fastPath);
 
     // Scalarized loop, same rationale of the punctual light version
     uint v_probeVolumeListOffset = 0;
     uint v_probeVolumeIdx = probeVolumeStart;
     while (v_probeVolumeListOffset < probeVolumeCount)
     {
         v_probeVolumeIdx = ProbeVolumeFetchIndex(probeVolumeStart, v_probeVolumeListOffset);
-#if SCALARIZE_LIGHT_LOOP
-        uint s_probeVolumeIdx = ScalarizeElementIndex(v_probeVolumeIdx, fastPath);
-#else
-        uint s_probeVolumeIdx = v_probeVolumeIdx;
-#endif
-
+        uint s_probeVolumeIdx = ProbeVolumeScalarizeElementIndex(v_probeVolumeIdx, fastPath);
         if (s_probeVolumeIdx == -1) { break; }
 
         // Scalar load.

diff --git a/...ender-pipelines.high-definition/Runtime/Lighting/ProbeVolume/ProbeVolumeLightLoopDef.hlsl b/...ender-pipelines.high-definition/Runtime/Lighting/ProbeVolume/ProbeVolumeLightLoopDef.hlsl
@@ -4,12 +4,6 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/ProbeVolume/ProbeVolumeShaderVariables.hlsl"
 
-#ifndef SCALARIZE_LIGHT_LOOP
-// We perform scalarization only for forward rendering as for deferred loads will already be scalar since tiles will match waves and therefore all threads will read from the same tile.
-// More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
-#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
-#endif
-
 #if SHADEROPTIONS_PROBE_VOLUMES_EVALUATION_MODE == PROBEVOLUMESEVALUATIONMODES_MATERIAL_PASS
 // Cluster helper functions copied and lightly modified from ClusteredUtils.hlsl with ENABLE_DEPTH_TEXTURE_BACKPLANE undefined
 
@@ -117,6 +111,8 @@ void ProbeVolumeGetCountAndStart(PositionInputs posInput, out uint probeVolumeSt
 
 void ProbeVolumeGetCountAndStartAndFastPath(PositionInputs posInput, out uint probeVolumeStart, out uint probeVolumeCount, out bool fastPath)
 {
+    // Fetch first probe volume to provide the scene proxy for screen space computation
+    ProbeVolumeGetCountAndStart(posInput, probeVolumeStart, probeVolumeCount);
     fastPath = false;
 
 #if SCALARIZE_LIGHT_LOOP

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/Shadow/ContactShadows.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/Shadow/ContactShadows.compute
@@ -14,6 +14,10 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoopDef.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/Shadow/ContactShadows.hlsl"
 
+// We always perform scalarization here, as we don't know at this point whether we have a clustered data structure or not.
+// More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
+#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER))
+
 #pragma only_renderers d3d11 playstation xboxone vulkan metal switch
 
 // #pragma enable_d3d11_debug_symbols

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalUtilities.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalUtilities.hlsl
@@ -1,6 +1,10 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/Decal.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalPrepassBuffer.hlsl"
 
+#ifndef SCALARIZE_LIGHT_LOOP
+#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
+#endif
+
 DECLARE_DBUFFER_TEXTURE(_DBufferTexture);
 
 // In order that the lod for with transpartent decal better match the lod for opaque decal
@@ -188,9 +192,11 @@ DecalSurfaceData GetDecalSurfaceData(PositionInputs posInput, inout float alpha)
 #ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
     GetCountAndStart(posInput, LIGHTCATEGORY_DECAL, decalStart, decalCount);
 
+    #if SCALARIZE_LIGHT_LOOP
     // Fast path is when we all pixels in a wave are accessing same tile or cluster.
-    uint decalStartLane0;
-    bool fastPath = IsFastPath(decalStart, decalStartLane0);
+    uint decalStartLane0 = WaveReadLaneFirst(decalStart);
+    bool fastPath = WaveActiveAllTrue(decalStart == decalStartLane0);
+    #endif
 
 #else // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
     decalCount = _DecalCount;
@@ -220,9 +226,26 @@ DecalSurfaceData GetDecalSurfaceData(PositionInputs posInput, inout float alpha)
         v_decalIdx = decalStart + v_decalListOffset;
 #endif // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
 
-        uint s_decalIdx = ScalarizeElementIndex(v_decalIdx, fastPath);
-        if (s_decalIdx == -1)
-            break;
+        uint s_decalIdx = v_decalIdx;
+
+#if SCALARIZE_LIGHT_LOOP
+
+        if (!fastPath)
+        {
+            // If we are not in fast path, v_decalIdx is not scalar, so we need to query the Min value across the wave.
+            s_decalIdx = WaveActiveMin(v_decalIdx);
+            // If WaveActiveMin returns 0xffffffff it means that all lanes are actually dead, so we can safely ignore the loop and move forward.
+            // This could happen as a helper lane could reach this point, hence having a valid v_decalIdx, but its value will be ignored by the WaveActiveMin
+            if (s_decalIdx == -1)
+            {
+                break;
+            }
+        }
+        // Note that the WaveReadLaneFirst should not be needed, but the compiler might insist in putting the result in VGPR.
+        // However, we are certain at this point that the index is scalar.
+        s_decalIdx = WaveReadLaneFirst(s_decalIdx);
+
+#endif // SCALARIZE_LIGHT_LOOP
 
         DecalData s_decalData = FetchDecal(s_decalIdx);
         bool isRejected = (s_decalData.decalLayerMask & decalLayerMask) == 0;

diff --git a/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl b/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl
@@ -54,6 +54,8 @@
 #define RAY_TRACING_OPTIONAL_ALPHA_TEST_PASS
 #endif
 
+
+
 // ----------------------------------------------------------------------------
 
 CBUFFER_START(UnityPerDraw)

diff --git a/...nity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariablesFunctions.hlsl b/...nity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariablesFunctions.hlsl
@@ -163,48 +163,4 @@ float3 TransformPreviousObjectToWorld(float3 positionOS)
     return mul(previousModelMatrix, float4(positionOS, 1.0)).xyz;
 }
 
-
-// ----------------------------------------------------------------------------
-// Scalarization helper functions.
-// These assume a scalarization of a list of elements as described in https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
-
-bool IsFastPath(uint lightStart, out uint lightStartLane0)
-{
-#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
-    // Fast path is when we all pixels in a wave are accessing same tile or cluster.
-    lightStartLane0 = WaveReadLaneFirst(lightStart);
-    return WaveActiveAllTrue(lightStart == lightStartLane0);
-#else
-    lightStartLane0 = lightStart;
-    return false;
-#endif
-}
-
-// This function scalarize an index accross all lanes. To be effecient it must be used in the context
-// of the scalarization of a loop. It is to use with IsFastPath so it can optimize the number of
-// element to load, which is optimal when all the lanes are contained into a tile.
-// Please note that if PLATFORM_SUPPORTS_WAVE_INTRINSICS is not defined, this will *not* scalarize the index.
-uint ScalarizeElementIndex(uint v_elementIdx, bool fastPath)
-{
-    uint s_elementIdx = v_elementIdx;
-#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
-    if (!fastPath)
-    {
-        // If we are not in fast path, v_elementIdx is not scalar, so we need to query the Min value across the wave.
-        s_elementIdx = WaveActiveMin(v_elementIdx);
-        // If WaveActiveMin returns 0xffffffff it means that all lanes are actually dead, so we can safely ignore the loop and move forward.
-        // This could happen as an helper lane could reach this point, hence having a valid v_elementIdx, but their values will be ignored by the WaveActiveMin
-        if (s_elementIdx == -1)
-        {
-            return -1;
-        }
-    }
-    // Note that the WaveReadLaneFirst should not be needed, but the compiler might insist in putting the result in VGPR.
-    // However, we are certain at this point that the index is scalar.
-    s_elementIdx = WaveReadLaneFirst(s_elementIdx);
-#endif
-    return s_elementIdx;
-}
-
-
 #endif // UNITY_SHADER_VARIABLES_FUNCTIONS_INCLUDED