Unity-Technologies · sebastienlagarde · Jan 14, 2021 · Jan 7, 2021 · Jan 7, 2021 · Jan 14, 2021
diff --git a/com.unity.render-pipelines.high-definition/CHANGELOG.md b/com.unity.render-pipelines.high-definition/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Fixed
 - Fixed GC allocations from XR occlusion mesh when using multipass.
 - Fixed XR depth copy when using MSAA.
+- Fixed register spilling on FXC in light list shaders.
 
 ## [11.0.0] - 2020-10-21
 

diff --git a/...der-pipelines.high-definition/Runtime/Lighting/LightLoop/lightlistbuild-clustered.compute b/...der-pipelines.high-definition/Runtime/Lighting/LightLoop/lightlistbuild-clustered.compute
@@ -69,6 +69,38 @@ groupshared float4 lightPlanes[4*6]; // Each plane is defined by a float4. 6 pla
 
 groupshared uint lightOffs;
 
+groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
+groupshared int shiftIndexScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
+
+// Clears one thread's slice of both groupshared scratch arrays: the LIGHTCATEGORY_COUNT
+// consecutive entries starting at threadIdx * LIGHTCATEGORY_COUNT are set to 0 in
+// categoryListCountScratch and in shiftIndexScratch.
+void ZeroCategoryListCountAndShiftIndex(uint threadIdx)
+{
+    const uint sliceStart = threadIdx * LIGHTCATEGORY_COUNT;
+
+    for (int category = 0; category < LIGHTCATEGORY_COUNT; ++category)
+    {
+        const uint slot = sliceStart + category;
+        categoryListCountScratch[slot] = 0;
+        shiftIndexScratch[slot] = 0;
+    }
+}
+
+// Stores 'value' into shiftIndexScratch at entry 'index' of the slice that starts at
+// threadIdx * LIGHTCATEGORY_COUNT.
+void WriteShiftIndex(uint threadIdx, uint index, int value)
+{
+    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
+    shiftIndexScratch[slot] = value;
+}
+
+// Returns the shiftIndexScratch value at entry 'index' of the slice that starts at
+// threadIdx * LIGHTCATEGORY_COUNT.
+int ReadShiftIndex(uint threadIdx, uint index)
+{
+    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
+    return shiftIndexScratch[slot];
+}
+
+// Adds one to the categoryListCountScratch counter at entry 'index' of the slice that
+// starts at threadIdx * LIGHTCATEGORY_COUNT.
+void IncrementCategoryListCount(uint threadIdx, uint index)
+{
+    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
+    categoryListCountScratch[slot] += 1;
+}
+
+// Returns the categoryListCountScratch counter at entry 'index' of the slice that
+// starts at threadIdx * LIGHTCATEGORY_COUNT.
+int ReadCategoryListCount(uint threadIdx, uint index)
+{
+    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
+    return categoryListCountScratch[slot];
+}
+
 #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
 groupshared uint ldsZMax;
 #endif
@@ -375,14 +407,11 @@ void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
     // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
     // to make it work correctly
-    int shiftIndex[LIGHTCATEGORY_COUNT];
-    ZERO_INITIALIZE_ARRAY(int, shiftIndex, LIGHTCATEGORY_COUNT);
-    shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
-    shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;
-    shiftIndex[LIGHTCATEGORY_DENSITY_VOLUME] = _DensityVolumeIndexShift;
+    ZeroCategoryListCountAndShiftIndex(t);
 
-    int categoryListCount[LIGHTCATEGORY_COUNT]; // number of direct lights, reflection probes, decals, density volumes, and probe volumes
-    ZERO_INITIALIZE_ARRAY(int, categoryListCount, LIGHTCATEGORY_COUNT);
+    WriteShiftIndex(t, LIGHTCATEGORY_ENV, _EnvLightIndexShift);
+    WriteShiftIndex(t, LIGHTCATEGORY_DECAL, _DecalIndexShift);
+    WriteShiftIndex(t, LIGHTCATEGORY_DENSITY_VOLUME, _DensityVolumeIndexShift);
 
     uint offs = start;
     for(int ll=0; ll<iNrCoarseLights; ll+=4)
@@ -401,8 +430,8 @@ void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             {
                 const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
                 uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
-                ++categoryListCount[lightCategory];
-                g_vLayeredLightList[offs++] = coarseList[l] - shiftIndex[lightCategory];
+                IncrementCategoryListCount(t, lightCategory);
+                g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
             }
         }
 
@@ -416,12 +445,12 @@ void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);
     for(int category=0; category<LIGHTCATEGORY_COUNT; category++)
     {
-        int numLights = min(categoryListCount[category],31);        // only allow 5 bits
+        int numLights = min(ReadCategoryListCount(t, category),31);        // only allow 5 bits
         if(i<nrClusters)
         {
             g_LayeredOffset[offs] = (start+localOffs) | (((uint) numLights)<<27);
             offs += (nrClusters*nrTilesX*nrTilesY);
-            localOffs += categoryListCount[category];       // use unclamped count for localOffs
+            localOffs += ReadCategoryListCount(t, category);       // use unclamped count for localOffs
         }
     }
 

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -215,8 +215,36 @@ groupshared uint  gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
 groupshared uint  gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
 #endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
 
+
+// ----------- Keep the vertex ring buffer in LDS: as a per-thread local array it causes register spilling with FXC.
+
+groupshared float gs_VertexRingBufferX[MAX_CLIP_VERTS * THREADS_PER_GROUP];
+groupshared float gs_VertexRingBufferY[MAX_CLIP_VERTS * THREADS_PER_GROUP];
+groupshared float gs_VertexRingBufferZ[MAX_CLIP_VERTS * THREADS_PER_GROUP];
+groupshared float gs_VertexRingBufferW[MAX_CLIP_VERTS * THREADS_PER_GROUP];
+
+// Reassembles a float4 from the four per-component groupshared ring buffers.
+// 'entry' selects the element within the slice that starts at threadIdx * MAX_CLIP_VERTS.
+float4 GetFromRingBuffer(uint threadIdx, uint entry)
+{
+    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;
+
+    return float4(gs_VertexRingBufferX[slot],
+                  gs_VertexRingBufferY[slot],
+                  gs_VertexRingBufferZ[slot],
+                  gs_VertexRingBufferW[slot]);
+}
+
+// Splits 'value' into its components and stores each one in the matching per-component
+// groupshared ring buffer, at element 'entry' of the slice that starts at
+// threadIdx * MAX_CLIP_VERTS.
+void WriteToRingBuffer(uint threadIdx, uint entry, float4 value)
+{
+    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;
+
+    gs_VertexRingBufferX[slot] = value.x;
+    gs_VertexRingBufferY[slot] = value.y;
+    gs_VertexRingBufferZ[slot] = value.z;
+    gs_VertexRingBufferW[slot] = value.w;
+}
+/////////////////////////////////////////////////////////
+
+
 // Returns 'true' if it manages to cull the face.
-bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
+bool TryCullFace(uint f, uint baseOffsetVertex)
 {
     uint cullMaskOfFace = FACE_MASK; // Initially behind
     uint vertListOfFace = GetVertexListOfFace(f);
@@ -225,7 +253,7 @@ bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
     {
         uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
         // Non-zero if ALL the vertices are behind any of the planes.
-        cullMaskOfFace &= behindMasksOfVerts[v];
+        cullMaskOfFace &= gs_BehindMasksOfVerts[baseOffsetVertex + v];
     }
 
     return (cullMaskOfFace != 0);
@@ -260,13 +288,13 @@ float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
 }
 
 void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
-                             inout float4 vertRingBuffer[MAX_CLIP_VERTS],
+                             uint threadIdx,
                              out uint dstBegin, out uint dstSize)
 {
     dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
     dstSize  = 0;
 
-    ClipVertex tailVert = CreateClipVertex(p, vertRingBuffer[(srcBegin + srcSize - 1) % MAX_CLIP_VERTS]);
+    ClipVertex tailVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, (srcBegin + srcSize - 1) % MAX_CLIP_VERTS));
 
 #ifdef OBTUSE_COMPILER
     uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
@@ -278,7 +306,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
     #ifndef OBTUSE_COMPILER
         uint modSrcIdx = j % MAX_CLIP_VERTS;
     #endif
-        ClipVertex leadVert = CreateClipVertex(p, vertRingBuffer[modSrcIdx]);
+        ClipVertex leadVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, modSrcIdx));
 
         // Execute Blinn's line clipping algorithm.
         // Classify the line segment. 4 cases:
@@ -295,7 +323,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
         #ifndef OBTUSE_COMPILER
             uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
         #endif
-            vertRingBuffer[modDstIdx] = clipVert;
+            WriteToRingBuffer(threadIdx, modDstIdx, clipVert);
         #ifdef OBTUSE_COMPILER
             dstSize++;
             modDstIdx++;
@@ -308,7 +336,9 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
         #ifndef OBTUSE_COMPILER
             uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
         #endif
-            vertRingBuffer[modDstIdx] = leadVert.pt;
+            WriteToRingBuffer(threadIdx, modDstIdx, leadVert.pt);
+
+            //vertRingBuffer[modDstIdx] = leadVert.pt;
         #ifdef OBTUSE_COMPILER
             dstSize++;
             modDstIdx++;
@@ -324,9 +354,9 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
     }
 }
 
-void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint baseVertexOffset,
+void ClipFaceAgainstViewVolume(uint f, uint baseVertexOffset,
                                out uint srcBegin, out uint srcSize,
-                               out float4 vertRingBuffer[MAX_CLIP_VERTS])
+                               uint threadIdx)
 {
     srcBegin = 0;
     srcSize  = 4;
@@ -338,14 +368,15 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
     {
         uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
         // Non-zero if ANY of the vertices are behind any of the planes.
-        clipMaskOfFace |= behindMasksOfVerts[v];
+        clipMaskOfFace |= gs_BehindMasksOfVerts[baseVertexOffset + v];
 
         // Not all edges may require clipping. However, filtering the vertex list
         // is somewhat expensive, so we currently don't do it.
-        vertRingBuffer[j].x = gs_HapVertsX[baseVertexOffset + v];
-        vertRingBuffer[j].y = gs_HapVertsY[baseVertexOffset + v];
-        vertRingBuffer[j].z = gs_HapVertsZ[baseVertexOffset + v];
-        vertRingBuffer[j].w = gs_HapVertsW[baseVertexOffset + v];
+        WriteToRingBuffer(threadIdx, j, float4(gs_HapVertsX[baseVertexOffset + v], gs_HapVertsY[baseVertexOffset + v], gs_HapVertsZ[baseVertexOffset + v], gs_HapVertsW[baseVertexOffset + v]));
+        //vertRingBuffer[j].x = gs_HapVertsX[baseVertexOffset + v];
+        //vertRingBuffer[j].y = gs_HapVertsY[baseVertexOffset + v];
+        //vertRingBuffer[j].z = gs_HapVertsZ[baseVertexOffset + v];
+        //vertRingBuffer[j].w = gs_HapVertsW[baseVertexOffset + v];
     }
 
     // Sutherland-Hodgeman polygon clipping algorithm.
@@ -355,7 +386,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
         uint p = firstbitlow(clipMaskOfFace);
 
         uint dstBegin, dstSize;
-        ClipPolygonAgainstPlane(p, srcBegin, srcSize, vertRingBuffer, dstBegin, dstSize);
+        ClipPolygonAgainstPlane(p, srcBegin, srcSize, threadIdx, dstBegin, dstSize);
 
         srcBegin = dstBegin;
         srcSize  = dstSize;
@@ -364,7 +395,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
     }
 }
 
-void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERTS],
+void UpdateAaBb(uint srcBegin, uint srcSize, uint threadIdx,
                 bool isOrthoProj, float4x4 invProjMat,
                 inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
 {
@@ -376,7 +407,7 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT
     #ifndef OBTUSE_COMPILER
         uint modSrcIdx = j % MAX_CLIP_VERTS;
     #endif
-        float4 hapVert = vertRingBuffer[modSrcIdx];
+        float4 hapVert = GetFromRingBuffer(threadIdx, modSrcIdx);
         // Clamp to the bounds in case of numerical errors (may still generate -0).
         float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
         float  rbpVertVSz = hapVert.w;
@@ -714,13 +745,6 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
     GroupMemoryBarrierWithGroupSync();
 #endif
 
-    uint behindMasksOfVerts[NUM_VERTS];
-
-    for (i = 0; i < NUM_VERTS; i++)
-    {
-        behindMasksOfVerts[i] = gs_BehindMasksOfVerts[baseVertexOffset + i];
-    }
-
     // (3) Cull the faces.
     {
         const uint cullFaceMask   = cullClipFaceMask;
@@ -734,7 +758,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
             {
                 uint f = NthBitLow(cullFaceMask, n);
 
-                if (TryCullFace(f, behindMasksOfVerts))
+                if (TryCullFace(f, baseVertexOffset))
                 {
                     cullClipFaceMask ^= 1 << f; // Clear the bit
                 }
@@ -773,10 +797,9 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
                 uint f = NthBitLow(clipFaceMask, n);
 
                 uint   srcBegin, srcSize;
-                float4 vertRingBuffer[MAX_CLIP_VERTS];
-                ClipFaceAgainstViewVolume(f, behindMasksOfVerts, baseVertexOffset,
-                                          srcBegin, srcSize, vertRingBuffer);
-                UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, invProjMat,
+                ClipFaceAgainstViewVolume(f, baseVertexOffset,
+                                          srcBegin, srcSize, t);
+                UpdateAaBb(srcBegin, srcSize, t, g_isOrthographic != 0, invProjMat,
                            ndcAaBbMinPt, ndcAaBbMaxPt);
             }
         }