From c084cdd0df7fa4c35be0136e7ae57bb0ebf79302 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Mon, 20 Jul 2020 16:35:07 -0700
Subject: [PATCH 01/22] Implement clipping and culling (does not consider view
 frustum corners)

---
 .../ShaderLibrary/Macros.hlsl                 |   1 +
 .../Runtime/Lighting/LightLoop/LightLoop.cs   |   9 +-
 .../Lighting/LightLoop/scrbound.compute       | 529 +++++++++++++++++-
 3 files changed, 533 insertions(+), 6 deletions(-)
diff --git a/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl b/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl
index f8f478a163e..c89f6f0bbd2 100644
--- a/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl
+++ b/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl
@@ -43,6 +43,7 @@
 #define HALF_MIN 6.103515625e-5  // 2^-14, the same value for 10, 11 and 16-bit: https://www.khronos.org/opengl/wiki/Small_Float_Formats
 #define HALF_MAX 65504.0
 #define UINT_MAX 0xFFFFFFFFu
+#define INT_MAX  0x7FFFFFFF
 
 
 #ifdef SHADER_API_GLES
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
index 1ca6843ed51..1aa7b2292f8 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
@@ -1642,9 +1642,12 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig
             }
             else if (gpuLightType == GPULightType.Point)
             {
-                Vector3 vx = xAxisVS;
-                Vector3 vy = yAxisVS;
-                Vector3 vz = zAxisVS;
+                // Construct a view-space axis-aligned bounding cube around the bounding sphere.
+                // This allows us to utilize the same polygon clipping technique for all lights.
+                // Non-axis-aligned vectors may result in a larger screen-space AABB.
+                Vector3 vx = new Vector3(1, 0, 0);
+                Vector3 vy = new Vector3(0, 1, 0);
+                Vector3 vz = new Vector3(0, 0, 1);
 
                 bound.center = positionVS;
                 bound.boxAxisX = vx * range;
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 2681070522f..6b37c450c6e 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -10,6 +10,7 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
 
+// #pragma enable_d3d11_debug_symbols
 #pragma only_renderers d3d11 playstation xboxone vulkan metal switch
 
 uniform int g_isOrthographic;
@@ -25,6 +26,288 @@ StructuredBuffer<SFiniteLightBound> g_data : register( t0 );
 // output buffer
 RWStructuredBuffer<float4> g_vBoundsBuffer : register( u0 );
 
+#define DUMB_COMPILER
+// #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported
+
+#ifdef Z_BINNING
+
+// Computes ceil(n / d) using integer arithmetic (the quotient is rounded up to the next integer).
+uint DivRoundUp(uint n, uint d)
+{
+    return (n + d - 1) / d; // No checks for division by 0 or for overflow of (n + d - 1)
+}
+
+// Returns the location of the N-th set bit ('n' is 0-based) starting from the lowest order bit and working upward.
+// Returns -1 (0xFFFFFFFF) if fewer than (n + 1) bits are set. Slow implementation - do not use for large bit sets.
+// Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html
+uint NthBitLow(uint value, uint n)
+{
+    uint b = -1;                                    // Consistent with the behavior of firstbitlow()
+    uint c = countbits(value);
+
+    if (n < c)                                      // Validate inputs
+    {
+        uint r = n + 1;                             // Compute the number of remaining bits
+
+        do
+        {
+            uint f = firstbitlow(value >> (b + 1)); // Find the next set bit
+            b += f + r;                             // Make a guess (assume all [b+f+1,b+f+r] bits are set)
+            c = countbits(value << (32 - (b + 1))); // Count the number of bits actually set
+            r = (n + 1) - c;                        // Compute the number of remaining bits
+        } while (r > 0);
+    }
+
+    return b;
+}
+
+// Clipping a plane by a cube may produce a hexagon (6-gon).
+// Clipping a hexagon by 4 planes may produce a decagon (10-gon).
+#define MAX_CLIP_VERTS    (10)
+#define NUM_EDGES         (12)
+#define NUM_VERTS         (8)
+#define NUM_FACES         (6)
+#define NUM_PLANES        (6)
+#define THREADS_PER_LIGHT (4)
+#define THREADS_PER_GROUP (64)
+#define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
+#define VERTS_PER_GROUP   (NUM_VERTS * LIGHTS_PER_GROUP)
+#define VERTS_PER_THREAD  (NUM_VERTS / THREADS_PER_LIGHT)
+#define FACES_PER_THREAD  DivRoundUp(NUM_FACES, THREADS_PER_LIGHT)
+
+// All planes and faces are always in the standard order (see below).
+#define FACE_LEFT   (1 << 0) // x = -1
+#define FACE_RIGHT  (1 << 1) // x = +1
+#define FACE_FRONT  (1 << 2) // y = -1
+#define FACE_BACK   (1 << 3) // y = +1
+#define FACE_TOP    (1 << 4) // z = -1
+#define FACE_BOTTOM (1 << 5) // z = +1
+#define FACE_MASK   ((1 << NUM_FACES) - 1)
+
+// TODO: the compiler generates 'tbuffer_load_format_x' instructions
+// when we access the look-up tables. Can we avoid this?
+
+// All vertices are always in the standard order (see below).
+static const uint s_FaceMasksOfVerts[NUM_VERTS] =
+{
+    FACE_LEFT  | FACE_FRONT | FACE_TOP,    // 0: (-1, -1, -1)
+    FACE_RIGHT | FACE_FRONT | FACE_TOP,    // 1: (+1, -1, -1)
+    FACE_RIGHT | FACE_BACK  | FACE_TOP,    // 2: (+1, +1, -1)
+    FACE_LEFT  | FACE_BACK  | FACE_TOP,    // 3: (-1, +1, -1)
+    FACE_LEFT  | FACE_FRONT | FACE_BOTTOM, // 4: (-1, -1, +1)
+    FACE_RIGHT | FACE_FRONT | FACE_BOTTOM, // 5: (+1, -1, +1)
+    FACE_RIGHT | FACE_BACK  | FACE_BOTTOM, // 6: (+1, +1, +1)
+    FACE_LEFT  | FACE_BACK  | FACE_BOTTOM  // 7: (-1, +1, +1)
+};
+
+// CCW order (starting with the LSB) of vertices for each face (w.r.t. its normal),
+// with normals pointing in the interior of the volume.
+static const uint s_VertMasksOfFaces[NUM_FACES] =
+{
+    3 << 9 | 7 << 6 | 4 << 3 | 0 << 0, // 0: FACE_LEFT
+    5 << 9 | 6 << 6 | 2 << 3 | 1 << 0, // 1: FACE_RIGHT
+    4 << 9 | 5 << 6 | 1 << 3 | 0 << 0, // 2: FACE_FRONT
+    2 << 9 | 6 << 6 | 7 << 3 | 3 << 0, // 3: FACE_BACK
+    1 << 9 | 2 << 6 | 3 << 3 | 0 << 0, // 4: FACE_TOP
+    7 << 9 | 6 << 6 | 5 << 3 | 4 << 0  // 5: FACE_BOTTOM
+};
+
+// 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
+groupshared float gs_HapVertsX[VERTS_PER_GROUP];
+groupshared float gs_HapVertsY[VERTS_PER_GROUP];
+groupshared float gs_HapVertsZ[VERTS_PER_GROUP];
+groupshared float gs_HapVertsW[VERTS_PER_GROUP];
+groupshared uint  gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types)
+
+#ifndef USE_WAVE_INTRINSICS
+// 1 array *  16 elements * 4 bytes each = 64 bytes.
+groupshared uint  gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces  each (HLSL does not support small data types)
+
+// 6 arrays * 16 elements * 4 bytes each = 384 bytes.
+// Note that these are actually floats reinterpreted as uints.
+// The reason is because floating-point atomic operations are not supported.
+groupshared uint  gs_RapAaBbMinPtX[LIGHTS_PER_GROUP];
+groupshared uint  gs_RapAaBbMaxPtX[LIGHTS_PER_GROUP];
+groupshared uint  gs_RapAaBbMinPtY[LIGHTS_PER_GROUP];
+groupshared uint  gs_RapAaBbMaxPtY[LIGHTS_PER_GROUP];
+groupshared uint  gs_RapAaBbMinPtZ[LIGHTS_PER_GROUP];
+groupshared uint  gs_RapAaBbMaxPtZ[LIGHTS_PER_GROUP];
+#endif // USE_WAVE_INTRINSICS
+
+// Returns 'true' if it manages to cull the face 'f', i.e. all 4 of its vertices are behind the same plane.
+bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
+{
+    uint cullMaskOfFace = FACE_MASK; // Initially behind
+    uint vertMaskOfFace = s_VertMasksOfFaces[f];
+
+    for (int j = 0; j < 4; j++)
+    {
+        uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3); // Vertex indices are packed 3 bits apiece
+        // The bit of a plane remains set only if ALL the vertices of the face are behind that plane.
+        cullMaskOfFace &= behindMasksOfVerts[v];
+    }
+
+    return (cullMaskOfFace != 0);
+}
+
+struct ClipVertex
+{
+    float4 pt; // Homogeneous coordinate after perspective
+    float  bc; // Boundary coordinate with respect to the plane 'p': (bc >= 0) <-> in, (bc < 0) <-> out
+};
+
+ClipVertex CreateClipVertex(uint p, float4 v)
+{
+    bool evenPlane = (p % 2) == 0; // Even planes cross '0', odd planes cross 'w'
+
+    float c = v[p / 2]; // The coordinate (0 = x, 1 = y, 2 = z) bounded by the plane 'p'
+    float w = v.w;
+
+    ClipVertex cv;
+
+    cv.pt = v;
+    cv.bc = evenPlane ? c : w - c; // dot(PlaneEquation, HapVertex);
+
+    return cv;
+}
+
+float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
+{
+    float alpha = saturate(v0.bc * rcp(v0.bc - v1.bc)); // Solves (bc == 0) along the edge; saturate() keeps it between 0 and 1
+
+    return lerp(v0.pt, v1.pt, alpha); // (alpha == 0) -> v0.pt, (alpha == 1) -> v1.pt
+}
+
+void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize, // 'p' is the index of the clipping plane
+                             inout float4 vertRingBuffer[MAX_CLIP_VERTS], // Holds both the source and the clipped polygon
+                             out uint dstBegin, out uint dstSize) // Range of the clipped polygon ('dstBegin' is not wrapped)
+{
+    dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
+    dstSize  = 0;
+
+    ClipVertex tailVert = CreateClipVertex(p, vertRingBuffer[(srcBegin + srcSize - 1) % MAX_CLIP_VERTS]);
+
+#ifdef DUMB_COMPILER
+    uint modSrcIdx = srcBegin % MAX_CLIP_VERTS; // Wrapped read index, advanced manually inside the loop
+    uint modDstIdx = dstBegin % MAX_CLIP_VERTS; // Wrapped write index, advanced manually after each store
+#endif
+
+    for (uint k = srcBegin; k < (srcBegin + srcSize); k++)
+    {
+    #ifndef DUMB_COMPILER
+        uint modSrcIdx = k % MAX_CLIP_VERTS;
+    #endif
+        ClipVertex leadVert = CreateClipVertex(p, vertRingBuffer[modSrcIdx]);
+
+        // Execute Blinn's line clipping algorithm.
+        // Classify the line segment. 4 cases:
+        // 0. v0 out, v1 out -> add nothing
+        // 1. v0 in,  v1 out -> add intersection
+        // 2. v0 out, v1 in  -> add intersection, add v1
+        // 3. v0 in,  v1 in  -> add v1
+        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of the signed zero.
+
+        if ((tailVert.bc >= 0) != (leadVert.bc >= 0))
+        {
+            // The line segment is guaranteed to cross the plane.
+            float4 clipVert = IntersectEdgeAgainstPlane(tailVert, leadVert);
+        #ifndef DUMB_COMPILER
+            uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
+        #endif
+            vertRingBuffer[modDstIdx] = clipVert;
+        #ifdef DUMB_COMPILER
+            dstSize++;
+            modDstIdx++;
+            modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
+        #endif
+        }
+
+        if (leadVert.bc >= 0)
+        {
+        #ifndef DUMB_COMPILER
+            uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
+        #endif
+            vertRingBuffer[modDstIdx] = leadVert.pt;
+        #ifdef DUMB_COMPILER
+            dstSize++;
+            modDstIdx++;
+            modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
+        #endif
+        }
+
+    #ifdef DUMB_COMPILER
+        modSrcIdx++;
+        modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx;
+    #endif
+        tailVert = leadVert; // Avoid recomputation and overwriting the vertex in the ring buffer
+    }
+}
+
+void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset,
+                                            inout float3 rapAaBbMinPt, inout float3 rapAaBbMaxPt)
+{
+    float4 vertRingBuffer[MAX_CLIP_VERTS];
+    uint srcBegin = 0, srcSize = 4; // The initial polygon is the quad formed by the 4 vertices of the face
+
+    uint clipMaskOfFace = 0; // Initially in front
+    uint vertMaskOfFace = s_VertMasksOfFaces[f];
+
+    for (int j = 0; j < 4; j++)
+    {
+        uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3);
+        // Non-zero if ANY of the vertices are behind any of the planes.
+        clipMaskOfFace |= behindMasksOfVerts[v];
+
+        // Note that not all edges may require clipping. However,
+        // filtering the vertex list is somewhat expensive, so we currently don't do it.
+        vertRingBuffer[j].x = gs_HapVertsX[firstVertexOffset + v];
+        vertRingBuffer[j].y = gs_HapVertsY[firstVertexOffset + v];
+        vertRingBuffer[j].z = gs_HapVertsZ[firstVertexOffset + v];
+        vertRingBuffer[j].w = gs_HapVertsW[firstVertexOffset + v];
+    }
+
+    const uint numPlanesToClipAgainst = countbits(clipMaskOfFace); // [1, 6]
+
+    // Sutherland-Hodgman polygon clipping algorithm.
+    // It works by clipping the entire polygon against one clipping plane at a time.
+    for (uint j = 0; j < numPlanesToClipAgainst; j++)
+    {
+        uint p = firstbitlow(clipMaskOfFace);
+
+        uint dstBegin, dstSize;
+        ClipPolygonAgainstPlane(p, srcBegin, srcSize, vertRingBuffer, dstBegin, dstSize);
+
+        srcBegin = dstBegin; // The output of this pass is the input of the next pass
+        srcSize  = dstSize;
+
+        clipMaskOfFace ^= 1 << p; // Clear the bit to continue using firstbitlow()
+    }
+
+#ifdef DUMB_COMPILER
+    uint modSrcIdx = srcBegin % MAX_CLIP_VERTS; // Wrapped read index, advanced manually inside the loop
+#endif
+
+    for (int j = srcBegin; j < (srcBegin + srcSize); j++)
+    {
+    #ifndef DUMB_COMPILER
+        uint modSrcIdx = j % MAX_CLIP_VERTS;
+    #endif
+
+        float4 hapVert = vertRingBuffer[modSrcIdx];
+        float3 rapVert = hapVert.xyz * rcp(hapVert.w); // Perspective division
+
+        rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
+        rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
+
+    #ifdef DUMB_COMPILER
+        modSrcIdx++;
+        modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx;
+    #endif
+    }
+}
+
+#else // !Z_BINNING
+
 #define MAX_PNTS        9       // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed)
                                 // However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane
                                 // clipping gets skipped which doesn't cause any errors.
@@ -44,6 +327,7 @@ void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, ou
 
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl"
 
+#endif // Z_BINNING
 
 [numthreads(NR_THREADS, 1, 1)]
 void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
@@ -59,13 +343,248 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     unsigned int g = groupID;
     unsigned int t = threadID;
 
-    const int subLigt = (int) (t/8);
-    const int lgtIndex = subLigt+(int) g*8;
-    const int sideIndex = (int) (t%8);
+    const int subLigt = (uint) (t/8);
+    const int lgtIndex = subLigt+(uint) g*8;
+    const int sideIndex = (uint) (t%8);
 
     const int eyeAdjustedLgtIndex = GenerateLightCullDataIndex(lgtIndex, g_iNrVisibLights, eyeIndex);
     SFiniteLightBound lgtDat = g_data[eyeAdjustedLgtIndex];
 
+#ifdef Z_BINNING
+    //**********************************************************************************************
+    // The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
+    // The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
+    //
+    // Since a light volume may be partially off-screen, we must clip it before computing the AABB.
+    // Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
+    //
+    // To avoid having to deal with toroidal properties of the perspective transform,
+    // we perform clipping using the homogeneous (projective) post-perspective coordinates.
+    // This clipping method is described in Blinn's paper titled "Line Clipping".
+    //
+    // The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
+    // worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
+    // Note that some faces may require culling rather than clipping (the former is simpler).
+    //
+    // It's important to realize that face culling may end up culling 5 (or even all 6) faces.
+    // This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
+    // (Imagine a view volume completely or partially inside a light volume).
+    // Therefore, we must perform view-volume-corner-inside-light-volume tests.
+    //
+    //
+    // Notation:
+    // rbp - real (3D) coordinates before perspective
+    // hbp - hom. (4D) coordinates before perspective
+    // hap - hom. (4D) coordinates after  perspective
+    // rap - real (3D) coordinates after  perspective (after division by w)
+    // *********************************************************************************************
+
+    const uint groupLocalLightIndex = t / THREADS_PER_LIGHT;
+    const uint firstVertexOffset    = NUM_VERTS * groupLocalLightIndex;
+
+    const float2 scale = lgtDat.scaleXY.xy;
+    const float3 rbpC  = lgtDat.center.xyz;
+    const float3 rbpX  = lgtDat.boxAxisX.xyz;
+    const float3 rbpY  = lgtDat.boxAxisY.xyz;
+    const float3 rbpZ  = lgtDat.boxAxisZ.xyz;
+
+#ifndef USE_WAVE_INTRINSICS
+    // Initialize the TGSM. All threads write the same value -> no data races.
+    // The hardware will coalesce the writes.
+    gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside
+    gs_RapAaBbMinPtX[groupLocalLightIndex]     = asuint(1.0f);
+    gs_RapAaBbMaxPtX[groupLocalLightIndex]     = asuint(0.0f);
+    gs_RapAaBbMinPtY[groupLocalLightIndex]     = asuint(1.0f);
+    gs_RapAaBbMaxPtY[groupLocalLightIndex]     = asuint(0.0f);
+    gs_RapAaBbMinPtZ[groupLocalLightIndex]     = asuint(1.0f);
+    gs_RapAaBbMaxPtZ[groupLocalLightIndex]     = asuint(0.0f);
+#endif // USE_WAVE_INTRINSICS
+
+    float3 rapAaBbMinPt = 1;
+    float3 rapAaBbMaxPt = 0;
+
+    // We must determine whether we have to clip or cull any of the faces.
+    // If all vertices of a face are inside with respect to all the culling planes,
+    // we can trivially accept that face. If all vertices of a face are behind
+    // any single plane, we can trivially reject (cull) that face.
+    uint cullClipFaceMask = 0; // Initially inside
+
+    // (1) Compute the vertices of the light volume.
+    for (uint i = 0; i < VERTS_PER_THREAD; i++)
+    {
+        uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+        // rbpVerts[0] = rbpC - rbpX * scale.x - rbpY * scale.y - rbpZ; // (-1, -1, -1)
+        // rbpVerts[1] = rbpC + rbpX * scale.x - rbpY * scale.y - rbpZ; // (+1, -1, -1)
+        // rbpVerts[2] = rbpC + rbpX * scale.x + rbpY * scale.y - rbpZ; // (+1, +1, -1)
+        // rbpVerts[3] = rbpC - rbpX * scale.x + rbpY * scale.y - rbpZ; // (-1, +1, -1)
+        // rbpVerts[4] = rbpC - rbpX           - rbpY           + rbpZ; // (-1, -1, +1)
+        // rbpVerts[5] = rbpC + rbpX           - rbpY           + rbpZ; // (+1, -1, +1)
+        // rbpVerts[6] = rbpC + rbpX           + rbpY           + rbpZ; // (+1, +1, +1)
+        // rbpVerts[7] = rbpC - rbpX           + rbpY           + rbpZ; // (-1, +1, +1)
+
+        float3 m; // See the comment above
+
+        m.x = (countbits(v % 4) == 1) ? 1 : -1;
+        m.y = ((v & 2) != 0)          ? 1 : -1; // Parentheses are required: '!=' binds tighter than '&'
+        m.z = (v >= 4)                ? 1 : -1;
+
+        m.xy *= (v >= 4) ? 1 : scale;
+
+        float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
+        float4 hapVert = mul(g_mProjection, float4(rbpVert, 1));
+
+        // Make sure the W component is strictly positive.
+        // It is helpful in order to simplify clipping and to avoid perspective division by 0.
+        float w = hapVert.w;
+        float s = (w >= 0) ? 1 : -1;
+
+        // Transform the X and Y components: [-w, w] -> [0, w].
+        hapVert.x = (0.5 * s) * hapVert.x + ((0.5 * s) * w);
+        hapVert.y = (0.5 * s) * hapVert.y + ((0.5 * s) * w);
+        hapVert.z = s * hapVert.z;
+        hapVert.w = max(abs(w), FLT_MIN);
+
+        // For each vertex, we must determine whether it is within the bounds.
+        // For culling and clipping, we must know, per culling plane, whether the vertex
+        // is in the positive or the negative half-space.
+        uint behindMask = 0; // Initially in front
+
+        // Consider the vertex to be inside the view volume if:
+        // 0 <= x <= w
+        // 0 <= y <= w
+        // 0 <= z <= w
+        w = hapVert.w;
+
+        for (uint j = 0; j < (NUM_PLANES / 2); j++)
+        {
+            behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
+            behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
+        }
+
+        if (behindMask == 0) // Inside?
+        {
+            float3 rapVert = hapVert.xyz * rcp(hapVert.w);
+
+            rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
+            rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
+        }
+        else // Outside
+        {
+            cullClipFaceMask |= s_FaceMasksOfVerts[v];
+        }
+
+        gs_HapVertsX[firstVertexOffset + v]          = hapVert.x;
+        gs_HapVertsY[firstVertexOffset + v]          = hapVert.y;
+        gs_HapVertsZ[firstVertexOffset + v]          = hapVert.z;
+        gs_HapVertsW[firstVertexOffset + v]          = hapVert.w;
+        gs_BehindMasksOfVerts[firstVertexOffset + v] = behindMask;
+    }
+
+#ifdef USE_WAVE_INTRINSICS
+    // ...
+#else
+    InterlockedOr(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
+
+    GroupMemoryBarrierWithGroupSync();
+
+    cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
+#endif
+
+    if (cullClipFaceMask != 0)
+    {
+        // The light may be partially outside the view volume.
+    }
+
+    uint behindMasksOfVerts[NUM_VERTS];
+
+    for (uint i = 0; i < NUM_VERTS; i++)
+    {
+        behindMasksOfVerts[i] = gs_BehindMasksOfVerts[firstVertexOffset + i];
+    }
+
+    // (2) Cull the faces.
+    const uint cullFaceMask   = cullClipFaceMask;
+    const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
+
+    for (uint i = 0; i < FACES_PER_THREAD; i++)
+    {
+        uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+        if (n < numFacesToCull)
+        {
+            uint f = NthBitLow(cullFaceMask, n);
+
+            if (TryCullFace(f, behindMasksOfVerts))
+            {
+                cullClipFaceMask ^= 1 << f; // Clear the bit
+            }
+        }
+    }
+
+#ifdef USE_WAVE_INTRINSICS
+    // ...
+#else
+    InterlockedAnd(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
+
+    GroupMemoryBarrierWithGroupSync();
+
+    cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
+#endif
+
+    // (3) Clip the faces.
+    const uint clipFaceMask   = cullClipFaceMask;
+    const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
+
+    for (uint i = 0; i < FACES_PER_THREAD; i++)
+    {
+        uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+        if (n < numFacesToClip) // Use the post-culling count: NthBitLow() returns -1 if (n >= count)
+        {
+            uint f = NthBitLow(clipFaceMask, n);
+
+            ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset,
+                                                   rapAaBbMinPt, rapAaBbMaxPt);
+        }
+    }
+
+#ifdef USE_WAVE_INTRINSICS
+    // ...
+#else
+    // Integer comparison works for floating-point numbers as long as the sign bit is 0.
+    // We must take care of the signed zero ourselves.
+    InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(rapAaBbMinPt.x) & INT_MAX);
+    InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(rapAaBbMaxPt.x) & INT_MAX);
+    InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(rapAaBbMinPt.y) & INT_MAX);
+    InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(rapAaBbMaxPt.y) & INT_MAX);
+    InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(rapAaBbMinPt.z) & INT_MAX);
+    InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(rapAaBbMaxPt.z) & INT_MAX);
+
+    GroupMemoryBarrierWithGroupSync();
+
+    rapAaBbMinPt.x = asfloat(gs_RapAaBbMinPtX[groupLocalLightIndex]);
+    rapAaBbMaxPt.x = asfloat(gs_RapAaBbMaxPtX[groupLocalLightIndex]);
+    rapAaBbMinPt.y = asfloat(gs_RapAaBbMinPtY[groupLocalLightIndex]);
+    rapAaBbMaxPt.y = asfloat(gs_RapAaBbMaxPtY[groupLocalLightIndex]);
+    rapAaBbMinPt.z = asfloat(gs_RapAaBbMinPtZ[groupLocalLightIndex]);
+    rapAaBbMaxPt.z = asfloat(gs_RapAaBbMaxPtZ[groupLocalLightIndex]);
+#endif // USE_WAVE_INTRINSICS
+
+    if (t % THREADS_PER_LIGHT == 0)
+    {
+        // Each light's AABB is represented by two float3s, the min and max of the box.
+        // And for stereo, we have two sets of lights. Therefore, each eye has a set of mins, followed by
+        // a set of maxs, and each set is equal to g_iNrVisibLights.
+        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex);
+
+        float minLinearDepth = -1, maxLinearDepth = -1; // TODO
+
+        g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, minLinearDepth);
+        g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, maxLinearDepth);
+    }
+
+#else // !Z_BINNING
     const float3 boxX = lgtDat.boxAxisX.xyz;
     const float3 boxY = lgtDat.boxAxisY.xyz;
     const float3 boxZ = -lgtDat.boxAxisZ.xyz;           // flip axis (so it points away from the light direction for a spot-light)
@@ -390,8 +909,10 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             g_vBoundsBuffer[boundsIndices.max] = float4(0.5*vMax.x + 0.5, 0.5*vMax.y + 0.5, vMax.z*VIEWPORT_SCALE_Z, linMaZ);
         }
     }
+#endif // Z_BINNING
 }
 
+#ifndef Z_BINNING
 
 float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p);
 
@@ -536,3 +1057,5 @@ void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, ou
     vMin = B;
     vMax = A;
 }
+
+#endif // !Z_BINNING
\ No newline at end of file

From a6f00ce6dc90e9e308ff90080b62985856333a15 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 5 Aug 2020 14:04:30 -0700
Subject: [PATCH 02/22] Support orthographic projection

---
 .../Runtime/Lighting/LightLoop/scrbound.compute                | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 6b37c450c6e..714a8002fa2 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -436,7 +436,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         // Make sure the W component is strictly positive.
         // It is helpful in order to simplify clipping and to avoid perspective division by 0.
-        float w = hapVert.w;
+        // For the orthographic projection, we only consider (w = 1)
+        float w = g_isOrthographic ? 1 : hapVert.w;
         float s = (w >= 0) ? 1 : -1;
 
         // Transform the X and Y components: [-w, w] -> [0, w].

From f8eea291dbff3ed9e2a8ffc8ff8701b41f2b9651 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 5 Aug 2020 14:38:37 -0700
Subject: [PATCH 03/22] k_identity -> k_Identity

---
 .../ShaderLibrary/Common.hlsl                        | 10 +++++-----
 .../ShaderLibrary/ImageBasedLighting.hlsl            |  4 ++--
 .../Editor/ShaderGraph/SharedCode.template.hlsl      |  2 +-
 .../Runtime/Material/AxF/AxF.hlsl                    | 10 +++++-----
 .../AxF/PreIntegratedFGD_CookTorrance.shader         |  2 +-
 .../Runtime/Material/Eye/Eye.hlsl                    |  4 ++--
 .../GGXConvolution/ComputeGgxIblSampleData.compute   |  4 ++--
 .../Runtime/Material/Lit/Lit.hlsl                    | 10 +++++-----
 .../Runtime/Material/Lit/SimpleLit.hlsl              |  2 +-
 .../Runtime/Material/StackLit/StackLit.hlsl          | 12 ++++++------
 .../RenderPipeline/ShaderPass/VaryingMesh.hlsl       |  2 +-
 11 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl b/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl
index 60449601be5..ce325adcde6 100644
--- a/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl
+++ b/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl
@@ -852,11 +852,11 @@ void CompositeOver(real3 colorFront, real3 alphaFront,
 // Space transformations
 // ----------------------------------------------------------------------------
 
-static const float3x3 k_identity3x3 = {1, 0, 0,
+static const float3x3 k_Identity3x3 = {1, 0, 0,
                                        0, 1, 0,
                                        0, 0, 1};
 
-static const float4x4 k_identity4x4 = {1, 0, 0, 0,
+static const float4x4 k_Identity4x4 = {1, 0, 0, 0,
                                        0, 1, 0, 0,
                                        0, 0, 1, 0,
                                        0, 0, 0, 1};
@@ -880,7 +880,7 @@ float4 ComputeClipSpacePosition(float2 positionNDC, float deviceDepth)
 // (position = positionCS) => (clipSpaceTransform = use default)
 // (position = positionVS) => (clipSpaceTransform = UNITY_MATRIX_P)
 // (position = positionWS) => (clipSpaceTransform = UNITY_MATRIX_VP)
-float4 ComputeClipSpacePosition(float3 position, float4x4 clipSpaceTransform = k_identity4x4)
+float4 ComputeClipSpacePosition(float3 position, float4x4 clipSpaceTransform = k_Identity4x4)
 {
     return mul(clipSpaceTransform, float4(position, 1.0));
 }
@@ -890,7 +890,7 @@ float4 ComputeClipSpacePosition(float3 position, float4x4 clipSpaceTransform = k
 // (position = positionCS) => (clipSpaceTransform = use default)
 // (position = positionVS) => (clipSpaceTransform = UNITY_MATRIX_P)
 // (position = positionWS) => (clipSpaceTransform = UNITY_MATRIX_VP)
-float3 ComputeNormalizedDeviceCoordinatesWithZ(float3 position, float4x4 clipSpaceTransform = k_identity4x4)
+float3 ComputeNormalizedDeviceCoordinatesWithZ(float3 position, float4x4 clipSpaceTransform = k_Identity4x4)
 {
     float4 positionCS = ComputeClipSpacePosition(position, clipSpaceTransform);
 
@@ -912,7 +912,7 @@ float3 ComputeNormalizedDeviceCoordinatesWithZ(float3 position, float4x4 clipSpa
 // (position = positionCS) => (clipSpaceTransform = use default)
 // (position = positionVS) => (clipSpaceTransform = UNITY_MATRIX_P)
 // (position = positionWS) => (clipSpaceTransform = UNITY_MATRIX_VP)
-float2 ComputeNormalizedDeviceCoordinates(float3 position, float4x4 clipSpaceTransform = k_identity4x4)
+float2 ComputeNormalizedDeviceCoordinates(float3 position, float4x4 clipSpaceTransform = k_Identity4x4)
 {
     return ComputeNormalizedDeviceCoordinatesWithZ(position, clipSpaceTransform).xy;
 }
diff --git a/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl b/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl
index 406fa64810a..b67991a54ee 100644
--- a/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl
+++ b/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl
@@ -183,7 +183,7 @@ void SampleVisibleAnisoGGXDir(real2 u,
     real3x3 viewToLocal;
     if (VeqN)
     {
-        viewToLocal = k_identity3x3;
+        viewToLocal = k_Identity3x3;
     }
     else
     {
@@ -366,7 +366,7 @@ real4 IntegrateGGXAndDisneyDiffuseFGD(real NdotV, real roughness, uint sampleCou
     real3 V   = real3(sqrt(1 - NdotV * NdotV), 0, NdotV);
     real4 acc = real4(0.0, 0.0, 0.0, 0.0);
 
-    real3x3 localToWorld = k_identity3x3;
+    real3x3 localToWorld = k_Identity3x3;
 
     for (uint i = 0; i < sampleCount; ++i)
     {
diff --git a/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl b/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl
index 93ebaac408a..54af96c8743 100644
--- a/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl
+++ b/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl
@@ -7,7 +7,7 @@
         // Init to some default value to make the computer quiet (else it output 'divide by zero' warning even if value is not used).
         // TODO: this is a really poor workaround, but the variable is used in a bunch of places
         // to compute normals which are then passed on elsewhere to compute other values...
-        output.tangentToWorld = k_identity3x3;
+        output.tangentToWorld = k_Identity3x3;
         output.positionSS = input.positionCS;       // input.positionCS is SV_Position
 
         $FragInputs.positionRWS:        output.positionRWS = input.positionRWS;
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl
index 29a7ad28cef..94758aa5406 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl
@@ -1338,14 +1338,14 @@ PreLightData    GetPreLightData(float3 viewWS_Clearcoat, PositionInputs posInput
     }
     else
     {
-        preLightData.ltcTransformDiffuse = k_identity3x3;   // Lambert
+        preLightData.ltcTransformDiffuse = k_Identity3x3;   // Lambert
     }
 
     // Load specular LTC & FGD
     switch ((_SVBRDF_BRDFType >> 1) & 7)
     {
     // Warning: all these LTC_MATRIX_INDEX_ are the same for now, and fitted for GGX, hence the code
-    // above that selected the UVs all used a preLightData.iblPerceptualRoughness value that used a 
+    // above that selected the UVs all used a preLightData.iblPerceptualRoughness value that used a
     // conversion formula for Beckmann NDF (exp) based BRDFs
     // (see switch ((_SVBRDF_BRDFType >> 1) & 7) above and usage of PerceptualRoughnessBeckmannToGGX)
     //
@@ -2037,7 +2037,7 @@ DirectLighting  EvaluateBSDF_Line(  LightLoopContext lightLoopContext,
 
     //-----------------------------------------------------------------------------
     // Use Lambert for diffuse
-    ltcValue = LTCEvaluate(P1, P2, B, k_identity3x3);    // No transform: Lambert uses identity
+    ltcValue = LTCEvaluate(P1, P2, B, k_Identity3x3);    // No transform: Lambert uses identity
     ltcValue *= lightData.diffuseDimmer;
     lighting.diffuse = ltcValue; // no FGD, lambert gives 1
 
@@ -2141,7 +2141,7 @@ DirectLighting  EvaluateBSDF_Line(  LightLoopContext lightLoopContext,
     {
         // Only lighting, not BSDF
         // Apply area light on lambert then multiply by PI to cancel Lambert
-        lighting.diffuse = LTCEvaluate(P1, P2, B, k_identity3x3);
+        lighting.diffuse = LTCEvaluate(P1, P2, B, k_Identity3x3);
         lighting.diffuse *= PI * lightData.diffuseDimmer;
     }
 #endif
@@ -2358,7 +2358,7 @@ DirectLighting  EvaluateBSDF_Rect(LightLoopContext lightLoopContext,
     {
         // Only lighting, not BSDF
         // Apply area light on lambert then multiply by PI to cancel Lambert
-        lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_identity3x3));
+        lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_Identity3x3));
         lighting.diffuse *= PI * lightData.diffuseDimmer;
     }
 #endif
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader
index 5dddd6a2842..8f2bcc9fd1b 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader
@@ -89,7 +89,7 @@ Shader "Hidden/HDRP/PreIntegratedFGD_CookTorrance"
                 float   NdotV    = ClampNdotV( dot(N, V) );
                 float4  acc      = float4(0.0, 0.0, 0.0, 0.0);
 
-                float3x3    localToWorld = GetLocalFrame(N); //TODO: N not needed, we use a frame aligned to N, should use k_identity3x3
+                float3x3    localToWorld = GetLocalFrame(N); //TODO: N not needed, we use a frame aligned to N, should use k_Identity3x3
 
                 for (uint i = 0; i < sampleCount; ++i)
                 {
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl
index df704ded1ff..3466ba4c714 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl
@@ -295,7 +295,7 @@ PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData b
     float2 uv = Remap01ToHalfTexelCoord(float2(bsdfData.perceptualRoughness, theta * INV_HALF_PI), LTC_LUT_SIZE);
 
     // Note we load the matrix transpose (avoid to have to transpose it in shader)
-    preLightData.ltcTransformDiffuse = k_identity3x3;
+    preLightData.ltcTransformDiffuse = k_Identity3x3;
 
     // Get the inverse LTC matrix for GGX
     // Note we load the matrix transpose (avoid to have to transpose it in shader)
@@ -660,7 +660,7 @@ DirectLighting EvaluateBSDF_Rect(   LightLoopContext lightLoopContext,
             {
                 // Only lighting, not BSDF
                 // Apply area light on lambert then multiply by PI to cancel Lambert
-                lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_identity3x3));
+                lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_Identity3x3));
                 lighting.diffuse *= PI * lightData.diffuseDimmer;
             }
 #endif
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute b/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute
index d8445386ed7..71c3ebfd5f9 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute
@@ -57,7 +57,7 @@ void ComputeGgxIblSampleData(uint3 groupThreadId : SV_GroupThreadID)
                 // TODO: might be interesting to try Mitchell's Poisson disk sampling algorithm.
                 // In our case, samples would not have disks associated with them, but rather solid angles.
                 float2 u = Golden2dSeq(i, sampleCount);
-                SampleGGXDir(u, V, k_identity3x3, roughness, localL, NdotL, NdotH, LdotH, true);
+                SampleGGXDir(u, V, k_Identity3x3, roughness, localL, NdotL, NdotH, LdotH, true);
 
                 if (NdotL > 0)
                 {
@@ -77,7 +77,7 @@ void ComputeGgxIblSampleData(uint3 groupThreadId : SV_GroupThreadID)
 
         float2 u = Golden2dSeq(sampleIndex, sampleCount);
 
-        SampleGGXDir(u, V, k_identity3x3, roughness, localL, NdotL, NdotH, LdotH, true);
+        SampleGGXDir(u, V, k_Identity3x3, roughness, localL, NdotL, NdotH, LdotH, true);
 
         float pdf    = 0.25 * D_GGX(NdotH, roughness);
         float omegaS = rcp(sampleCount) * rcp(pdf);
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl
index 959efa32a01..996785eb3c7 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl
@@ -1082,7 +1082,7 @@ PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData b
 
     // Note we load the matrix transpose (avoid to have to transpose it in shader)
 #ifdef USE_DIFFUSE_LAMBERT_BRDF
-    preLightData.ltcTransformDiffuse = k_identity3x3;
+    preLightData.ltcTransformDiffuse = k_Identity3x3;
 #else
     // Get the inverse LTC matrix for Disney Diffuse
     preLightData.ltcTransformDiffuse      = 0.0;
@@ -1417,7 +1417,7 @@ DirectLighting EvaluateBSDF_Line(   LightLoopContext lightLoopContext,
             // Use the Lambertian approximation for performance reasons.
             // The matrix multiplication should not generate any extra ALU on GCN.
             // TODO: double evaluation is very inefficient! This is a temporary solution.
-            ltcValue  = LTCEvaluate(P1, P2, B, mul(flipMatrix, k_identity3x3));
+            ltcValue  = LTCEvaluate(P1, P2, B, mul(flipMatrix, k_Identity3x3));
             ltcValue *= lightData.diffuseDimmer;
             // We use diffuse lighting for accumulation since it is going to be blurred during the SSS pass.
             // We don't multiply by 'bsdfData.diffuseColor' here. It's done only once in PostEvaluateBSDF().
@@ -1452,7 +1452,7 @@ DirectLighting EvaluateBSDF_Line(   LightLoopContext lightLoopContext,
         {
             // Only lighting, not BSDF
             // Apply area light on lambert then multiply by PI to cancel Lambert
-            lighting.diffuse = LTCEvaluate(P1, P2, B, k_identity3x3);
+            lighting.diffuse = LTCEvaluate(P1, P2, B, k_Identity3x3);
             lighting.diffuse *= PI * lightData.diffuseDimmer;
         }
     #endif
@@ -1572,7 +1572,7 @@ DirectLighting EvaluateBSDF_Rect(   LightLoopContext lightLoopContext,
 
                 // Use the Lambertian approximation for performance reasons.
                 // The matrix multiplication should not generate any extra ALU on GCN.
-                float3x3 ltcTransform = mul(flipMatrix, k_identity3x3);
+                float3x3 ltcTransform = mul(flipMatrix, k_Identity3x3);
 
                 // Polygon irradiance in the transformed configuration.
                 // TODO: double evaluation is very inefficient! This is a temporary solution.
@@ -1640,7 +1640,7 @@ DirectLighting EvaluateBSDF_Rect(   LightLoopContext lightLoopContext,
             {
                 // Only lighting, not BSDF
                 // Apply area light on lambert then multiply by PI to cancel Lambert
-                lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_identity3x3));
+                lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_Identity3x3));
                 lighting.diffuse *= PI * lightData.diffuseDimmer;
             }
         #endif
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl
index 6e311ca6951..75efb9f667e 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl
@@ -211,7 +211,7 @@ PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData b
     float theta = FastACosPos(clampedNdotV); // For Area light - UVs for sampling the LUTs
     float2 uv = LTC_LUT_OFFSET + LTC_LUT_SCALE * float2(bsdfData.perceptualRoughness, theta * INV_HALF_PI);
 
-    preLightData.ltcTransformDiffuse = k_identity3x3;
+    preLightData.ltcTransformDiffuse = k_Identity3x3;
 
     preLightData.ltcTransformSpecular      = 0.0;
     preLightData.ltcTransformSpecular._m22 = 1.0;
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl
index 80a53b76a8a..9f8a5c28ff7 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl
@@ -2261,7 +2261,7 @@ void PreLightData_SetupAreaLights(BSDFData bsdfData, float3 V, float3 N[NB_NORMA
 
 
 #ifdef USE_DIFFUSE_LAMBERT_BRDF
-    preLightData.ltcTransformDiffuse = k_identity3x3;
+    preLightData.ltcTransformDiffuse = k_Identity3x3;
 #else
     // TODO
     // Get the inverse LTC matrix for Disney Diffuse
@@ -2314,7 +2314,7 @@ void PreLightData_SetupAreaLightsAniso(BSDFData bsdfData, float3 V, float3 N[NB_
 
 
 #ifdef USE_DIFFUSE_LAMBERT_BRDF
-    preLightData.ltcTransformDiffuse = k_identity3x3;
+    preLightData.ltcTransformDiffuse = k_Identity3x3;
 #else
     // TODO
     // Get the inverse LTC matrix for Disney Diffuse
@@ -3801,7 +3801,7 @@ DirectLighting EvaluateBSDF_Line(   LightLoopContext lightLoopContext,
             // Use the Lambertian approximation for performance reasons.
             // The matrix multiplication should not generate any extra ALU on GCN.
             // TODO: double evaluation is very inefficient! This is a temporary solution.
-            ltcValue  = LTCEvaluate(localP1, localP2, B, mul(flipMatrix, k_identity3x3));
+            ltcValue  = LTCEvaluate(localP1, localP2, B, mul(flipMatrix, k_Identity3x3));
             ltcValue *= lightData.diffuseDimmer;
 
             // VLAYERED_DIFFUSE_ENERGY_HACKED_TERM:
@@ -3892,7 +3892,7 @@ DirectLighting EvaluateBSDF_Line(   LightLoopContext lightLoopContext,
 
             // Only lighting, not BSDF
             // Apply area light on lambert then multiply by PI to cancel Lambert
-            lighting.diffuse = LTCEvaluate(localP1, localP2, B, k_identity3x3);
+            lighting.diffuse = LTCEvaluate(localP1, localP2, B, k_Identity3x3);
             lighting.diffuse *= PI * lightData.diffuseDimmer;
         }
     #endif
@@ -4022,7 +4022,7 @@ DirectLighting EvaluateBSDF_Rect(   LightLoopContext lightLoopContext,
     
                 // Use the Lambertian approximation for performance reasons.
                 // The matrix multiplication should not generate any extra ALU on GCN.
-                float3x3 ltcTransform = mul(flipMatrix, k_identity3x3);
+                float3x3 ltcTransform = mul(flipMatrix, k_Identity3x3);
     
                 // Polygon irradiance in the transformed configuration.
                 // TODO: double evaluation is very inefficient! This is a temporary solution.
@@ -4136,7 +4136,7 @@ DirectLighting EvaluateBSDF_Rect(   LightLoopContext lightLoopContext,
 
                 // Only lighting, not BSDF
                 // Apply area light on lambert then multiply by PI to cancel Lambert
-                lighting.diffuse = PolygonIrradiance(mul(localLightVerts, k_identity3x3));
+                lighting.diffuse = PolygonIrradiance(mul(localLightVerts, k_Identity3x3));
                 lighting.diffuse *= PI * lightData.diffuseDimmer;
             }
         #endif
diff --git a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl
index dca58fd987e..d2d12385dca 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl
@@ -140,7 +140,7 @@ FragInputs UnpackVaryingsMeshToFragInputs(PackedVaryingsMeshToPS input)
     // Init to some default value to make the computer quiet (else it output "divide by zero" warning even if value is not used).
     // TODO: this is a really poor workaround, but the variable is used in a bunch of places
     // to compute normals which are then passed on elsewhere to compute other values...
-    output.tangentToWorld = k_identity3x3;
+    output.tangentToWorld = k_Identity3x3;
 
     output.positionSS = input.positionCS; // input.positionCS is SV_Position
 

From ccbb9e16e6b27b85038f026d99a6267a96e6ab73 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 5 Aug 2020 15:52:27 -0700
Subject: [PATCH 04/22] Turn 'scaleXY' into a scalar

---
 .../Runtime/Lighting/LightLoop/LightLoop.cs   | 30 +++++++++----------
 .../Lighting/LightLoop/LightLoop.cs.hlsl      |  2 +-
 .../Lighting/LightLoop/scrbound.compute       |  8 ++---
 .../Runtime/Material/Decal/DecalSystem.cs     |  6 ++--
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
index 1aa7b2292f8..3d6fc7dc90c 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
@@ -128,8 +128,8 @@ struct SFiniteLightBound
         public Vector3 boxAxisY; // Scaled by the extents (half-size)
         public Vector3 boxAxisZ; // Scaled by the extents (half-size)
         public Vector3 center;   // Center of the bounds (box) in camera space
-        public Vector2 scaleXY;  // Scale applied to the top of the box to turn it into a truncated pyramid
-        public float radius;     // Circumscribed sphere for the bounds (box)
+        public float   scaleXY;  // Scale applied to the top of the box to turn it into a truncated pyramid (X = Y)
+        public float   radius;     // Circumscribed sphere for the bounds (box)
     };
 
     [GenerateHLSL]
@@ -1628,9 +1628,9 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig
                 fAltDx *= range; fAltDy *= range;
 
                 // Handle case of pyramid with this select (currently unused)
-                var altDist = Mathf.Sqrt(fAltDy * fAltDy + (true ? 1.0f : 2.0f) * fAltDx * fAltDx);
-                bound.radius = altDist > (0.5f * range) ? altDist : (0.5f * range);       // will always pick fAltDist
-                bound.scaleXY = squeeze ? new Vector2(0.01f, 0.01f) : new Vector2(1.0f, 1.0f);
+                var altDist   = Mathf.Sqrt(fAltDy * fAltDy + (true ? 1.0f : 2.0f) * fAltDx * fAltDx);
+                bound.radius  = altDist > (0.5f * range) ? altDist : (0.5f * range);       // will always pick fAltDist
+                bound.scaleXY = squeeze ? 0.01f : 1.0f;
 
                 lightVolumeData.lightAxisX = vx;
                 lightVolumeData.lightAxisY = vy;
@@ -1653,8 +1653,8 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig
                 bound.boxAxisX = vx * range;
                 bound.boxAxisY = vy * range;
                 bound.boxAxisZ = vz * range;
-                bound.scaleXY.Set(1.0f, 1.0f);
-                bound.radius = range;
+                bound.scaleXY  = 1.0f;
+                bound.radius   = range;
 
                 // fill up ldata
                 lightVolumeData.lightAxisX = vx;
@@ -1675,7 +1675,7 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig
                 bound.boxAxisY = extents.y * yAxisVS;
                 bound.boxAxisZ = extents.z * zAxisVS;
                 bound.radius   = extents.magnitude;
-                bound.scaleXY.Set(1.0f, 1.0f);
+                bound.scaleXY  = 1.0f;
 
                 lightVolumeData.lightPos   = centerVS;
                 lightVolumeData.lightAxisX = xAxisVS;
@@ -1695,7 +1695,7 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig
                 bound.boxAxisY = extents.y * yAxisVS;
                 bound.boxAxisZ = extents.z * zAxisVS;
                 bound.radius   = extents.magnitude;
-                bound.scaleXY.Set(1.0f, 1.0f);
+                bound.scaleXY  = 1.0f;
 
                 lightVolumeData.lightPos   = centerVS;
                 lightVolumeData.lightAxisX = xAxisVS;
@@ -1715,7 +1715,7 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig
                 bound.boxAxisY = extents.y * yAxisVS;
                 bound.boxAxisZ = extents.z * zAxisVS;
                 bound.radius   = extents.magnitude;
-                bound.scaleXY.Set(1.0f, 1.0f);
+                bound.scaleXY  = 1.0f;
 
                 lightVolumeData.lightPos   = centerVS;
                 lightVolumeData.lightAxisX = xAxisVS;
@@ -1894,8 +1894,8 @@ void GetEnvLightVolumeDataAndBound(HDProbe probe, LightVolumeType lightVolumeTyp
                     bound.boxAxisX = influenceRightVS * influenceExtents.x;
                     bound.boxAxisY = influenceUpVS * influenceExtents.x;
                     bound.boxAxisZ = influenceForwardVS * influenceExtents.x;
-                    bound.scaleXY.Set(1.0f, 1.0f);
-                    bound.radius = influenceExtents.x;
+                    bound.scaleXY  = 1.0f;
+                    bound.radius   = influenceExtents.x;
                     break;
                 }
                 case LightVolumeType.Box:
@@ -1904,8 +1904,8 @@ void GetEnvLightVolumeDataAndBound(HDProbe probe, LightVolumeType lightVolumeTyp
                     bound.boxAxisX = influenceExtents.x * influenceRightVS;
                     bound.boxAxisY = influenceExtents.y * influenceUpVS;
                     bound.boxAxisZ = influenceExtents.z * influenceForwardVS;
-                    bound.scaleXY.Set(1.0f, 1.0f);
-                    bound.radius = influenceExtents.magnitude;
+                    bound.scaleXY  = 1.0f;
+                    bound.radius   = influenceExtents.magnitude;
 
                     // The culling system culls pixels that are further
                     //   than a threshold to the box influence extents.
@@ -1945,7 +1945,7 @@ void AddBoxVolumeDataAndBound(OrientedBBox obb, LightCategory category, LightFea
             bound.boxAxisY = obb.extentY * upVS;
             bound.boxAxisZ = obb.extentZ * forwardVS;
             bound.radius   = extents.magnitude;
-            bound.scaleXY.Set(1.0f, 1.0f);
+            bound.scaleXY  = 1.0f;
 
             // The culling system culls pixels that are further
             //   than a threshold to the box influence extents.
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl
index 5efdcddabfc..f158b3be894 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl
@@ -66,7 +66,7 @@ struct SFiniteLightBound
     float3 boxAxisY;
     float3 boxAxisZ;
     float3 center;
-    float2 scaleXY;
+    float scaleXY;
     float radius;
 };
 
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 714a8002fa2..38582d5e548 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -382,11 +382,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     const uint groupLocalLightIndex = t / THREADS_PER_LIGHT;
     const uint firstVertexOffset    = NUM_VERTS * groupLocalLightIndex;
 
-    const float2 scale = lgtDat.scaleXY.xy;
+    const float  scale = lgtDat.scaleXY.x;    // scale.x = scale.y
     const float3 rbpC  = lgtDat.center.xyz;
-    const float3 rbpX  = lgtDat.boxAxisX.xyz;
-    const float3 rbpY  = lgtDat.boxAxisY.xyz;
-    const float3 rbpZ  = lgtDat.boxAxisZ.xyz;
+    const float3 rbpX  = lgtDat.boxAxisX.xyz; // Pre-scaled
+    const float3 rbpY  = lgtDat.boxAxisY.xyz; // Pre-scaled
+    const float3 rbpZ  = lgtDat.boxAxisZ.xyz; // Pre-scaled
 
 #ifndef USE_WAVE_INTRINSICS
     // Initialize the TGSM. All threads write the same value -> no data races.
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs b/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs
index 55c80ef43a1..c71e4846911 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs
+++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs
@@ -579,12 +579,12 @@ private void GetDecalVolumeDataAndBound(Matrix4x4 decalToWorld, Matrix4x4 worldT
                 var influenceForwardVS = worldToView.MultiplyVector(influenceZ / influenceExtents.z);
                 var influencePositionVS = worldToView.MultiplyPoint(pos); // place the mesh pivot in the center
 
-                m_Bounds[m_DecalDatasCount].center = influencePositionVS;
+                m_Bounds[m_DecalDatasCount].center   = influencePositionVS;
                 m_Bounds[m_DecalDatasCount].boxAxisX = influenceRightVS * influenceExtents.x;
                 m_Bounds[m_DecalDatasCount].boxAxisY = influenceUpVS * influenceExtents.y;
                 m_Bounds[m_DecalDatasCount].boxAxisZ = influenceForwardVS * influenceExtents.z;
-                m_Bounds[m_DecalDatasCount].scaleXY.Set(1.0f, 1.0f);
-                m_Bounds[m_DecalDatasCount].radius = influenceExtents.magnitude;
+                m_Bounds[m_DecalDatasCount].scaleXY  = 1.0f;
+                m_Bounds[m_DecalDatasCount].radius   = influenceExtents.magnitude;
 
                 // The culling system culls pixels that are further
                 //   than a threshold to the box influence extents.

From 1559d1c80966b02b9ce2992fbb7bdc72c8af6d84 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 5 Aug 2020 18:53:33 -0700
Subject: [PATCH 05/22] Test corners of the view volume

---
 .../Lighting/LightLoop/scrbound.compute       | 138 +++++++++++++++++-
 1 file changed, 130 insertions(+), 8 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 38582d5e548..77eb4f6a530 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -61,6 +61,55 @@ uint NthBitLow(uint value, uint n)
     return b;
 }
 
+float4x4 Translation4x4(float3 d) // Returns the identity matrix with 'd' written into the last column
+{
+    float4x4 M = k_Identity4x4;
+
+    M._14_24_34 = d; // Last column (rows 1-3 of column 4); the 4th row stays (0, 0, 0, 1)
+
+    return M;
+}
+
+float3x3 Rotation3x3(float3 xAxis, float3 yAxis, float3 zAxis) // Returns a 3x3 matrix whose columns are the given axes
+{
+    float3x3 R = float3x3(xAxis, yAxis, zAxis); // The constructor stores each axis as a row
+    float3x3 C = transpose(R); // Row to column
+
+    return C; // The axes are used as-is (not normalized here)
+}
+
+float3x3 Invert3x3(float3x3 R) // Returns the inverse of 'R', computed as (adjugate / determinant)
+{
+    float3x3 C   = transpose(R); // Row to column: C[i] is the i-th column of R
+    float    det = dot(C[0], cross(C[1], C[2])); // Scalar triple product of the columns
+    float3x3 adj = float3x3(cross(C[1], C[2]),   // Each row of the adjugate is the cross product
+                            cross(C[2], C[0]),   // of the other two columns of R
+                            cross(C[0], C[1]));
+
+    return rcp(det) * adj; // No guard against (det = 0): a singular 'R' is not handled here
+}
+
+float4x4 Homogenize3x3(float3x3 R) // Embeds 'R' into the upper-left 3x3 block of a 4x4 matrix
+{
+    float4x4 M = float4x4(float4(R[0], 0), // Rows of R, padded with 0 in the last column
+                          float4(R[1], 0),
+                          float4(R[2], 0),
+                          float4(0,0,0,1)); // Last row leaves the W component unchanged
+
+    return M;
+}
+
+float4x4 PerspectiveProjection4x4(float s, float g, float n, float f) // 's' divides the X scale only; 'g' scales X and Y
+{
+    float a = (f + n) * rcp(f - n);    // With (b), maps (z = n) to (z/w = -1) and (z = f) to (z/w = +1)
+    float b = -2 * f * n * rcp(f - n); // Requires (f != n)
+
+    return float4x4(g/s, 0, 0, 0, // x' = (g/s) * x   NOTE(review): 's' looks like an aspect ratio - confirm
+                      0, g, 0, 0, // y' = g * y
+                      0, 0, a, b, // z' = a * z + b * w
+                      0, 0, 1, 0); // w' = z (no Z-flip)
+}
+
 // Clipping a plane by a cube may produce a hexagon (6-gon).
 // Clipping a hexagon by 4 planes may produce a decagon (10-gon).
 #define MAX_CLIP_VERTS    (10)
@@ -382,7 +431,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     const uint groupLocalLightIndex = t / THREADS_PER_LIGHT;
     const uint firstVertexOffset    = NUM_VERTS * groupLocalLightIndex;
 
-    const float  scale = lgtDat.scaleXY.x;    // scale.x = scale.y
+    const float  scale = lgtDat.scaleXY;      // scale.x = scale.y
     const float3 rbpC  = lgtDat.center.xyz;
     const float3 rbpX  = lgtDat.boxAxisX.xyz; // Pre-scaled
     const float3 rbpY  = lgtDat.boxAxisY.xyz; // Pre-scaled
@@ -432,19 +481,21 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         m.xy *= (v >= 4) ? 1 : scale;
 
         float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
+        // Avoid generating (w = 0).
+        rbpVert.z = (abs(rbpVert.z) >= FLT_EPS) ? rbpVert.z : FLT_EPS;
+
         float4 hapVert = mul(g_mProjection, float4(rbpVert, 1));
 
         // Make sure the W component is strictly positive.
         // It is helpful in order to simplify clipping and to avoid perspective division by 0.
-        // For the orthographic projection, we only consider (w = 1)
-        float w = g_isOrthographic ? 1 : hapVert.w;
+        float w = hapVert.w;
         float s = (w >= 0) ? 1 : -1;
 
         // Transform the X and Y components: [-w, w] -> [0, w].
         hapVert.x = (0.5 * s) * hapVert.x + ((0.5 * s) * w);
         hapVert.y = (0.5 * s) * hapVert.y + ((0.5 * s) * w);
         hapVert.z = s * hapVert.z;
-        hapVert.w = max(abs(w), FLT_MIN);
+        hapVert.w = s * hapVert.w;
 
         // For each vertex, we must determine whether it is within the bounds.
         // For culling and clipping, we must know, per culling plane, whether the vertex
@@ -453,8 +504,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         // Consider the vertex to be inside the view volume if:
         // 0 <= x <= w
-        // 0 <= y <= w
+        // 0 <= y <= w   <-- include boundary points, to avoid clipping them later
         // 0 <= z <= w
+        // w is always valid
+        // For the orthographic projection, (w = 1), so no modifications are necessary.
+        // TODO: epsilon for numerical robustness?
         w = hapVert.w;
 
         for (uint j = 0; j < (NUM_PLANES / 2); j++)
@@ -492,9 +546,77 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
 #endif
 
+    // (2) Test the corners of the view volume.
     if (cullClipFaceMask != 0)
     {
-        // The light may be partially outside the view volume.
+        // The light is partially outside the view volume.
+        // Therefore, some of the corners of the view volume may be inside the light volume.
+        // We perform aggressive culling, so we must make sure they are accounted for.
+        // The light volume is a special type of cuboid - a right frustum.
+        // We can exploit this fact by building a light-space projection matrix.
+        float4x4 invTranslateToLightSpace      = Translation4x4(-rbpC);
+        float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(Rotation3x3(rbpX, rbpY, rbpZ)));
+        // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.
+
+        // This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
+        float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace);
+
+        if (scale != 1) // Perspective light space?
+        {
+            // Compute the parameters of the perspective projection.
+            float s = scale;
+            float e = -1 - 2 * (s * rcp(1 - s)); // Signed distance from the origin to the eye
+            float n = -e - 1;                    // Distance from the eye to the near plane
+            float f = -e + 1;                    // Distance from the eye to the far plane
+            float g = f;                         // Distance from the eye to the projection plane
+            // The frustum is symmetric in X and Y (single scalar 'scale'), so the X divisor (1st argument) must be 1.
+            float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
+            float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f); // Passing 's' would shrink X by 's'
+
+            lightSpaceMatrix = mul(perspProjMatrix, mul(invTranslateEye, lightSpaceMatrix));
+        }
+
+        for (uint i = 0; i < VERTS_PER_THREAD; i++)
+        {
+            uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+            // rapVertsCS[0] = (-1, -1, 0)
+            // rapVertsCS[1] = (+1, -1, 0)
+            // rapVertsCS[2] = (+1, +1, 0)
+            // rapVertsCS[3] = (-1, +1, 0)
+            // rapVertsCS[4] = (-1, -1, 1)
+            // rapVertsCS[5] = (+1, -1, 1)
+            // rapVertsCS[6] = (+1, +1, 1)
+            // rapVertsCS[7] = (-1, +1, 1)
+
+            float3 rapVertCS; // See the comment above
+            // Keep (v & 2) parenthesized: '!=' binds tighter than '&', so (v & 2 != 0) would test bit 0 instead.
+            rapVertCS.x = (countbits(v % 4) == 1) ? 1 : -1;
+            rapVertCS.y = ((v & 2) != 0)          ? 1 : -1;
+            rapVertCS.z = (v >= 4)                ? 1 :  0;
+
+            float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space
+            float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS);            // View to light space
+
+            // Consider the vertex to be inside the light volume if:
+            // -w < x < w
+            // -w < y < w   <-- exclude boundary points, as we will not clip using these vertices
+            // -w < z < w
+            // 0  < w
+            // For the orthographic projection, (w = 1), so no modifications are necessary.
+            // TODO: epsilon for numerical robustness?
+
+            bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w;
+
+            if (inside)
+            {
+                float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
+
+                // Update the AABB.
+                rapAaBbMinPt = min(rapAaBbMinPt, rapVertNDC);
+                rapAaBbMaxPt = max(rapAaBbMaxPt, rapVertNDC);
+            }
+        }
     }
 
     uint behindMasksOfVerts[NUM_VERTS];
@@ -504,7 +626,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         behindMasksOfVerts[i] = gs_BehindMasksOfVerts[firstVertexOffset + i];
     }
 
-    // (2) Cull the faces.
+    // (3) Cull the faces.
     const uint cullFaceMask   = cullClipFaceMask;
     const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
 
@@ -533,7 +655,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
 #endif
 
-    // (3) Clip the faces.
+    // (4) Clip the faces.
     const uint clipFaceMask   = cullClipFaceMask;
     const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
 

From b8e8c836bc1d6bf9536d806b79598d8b1b283f02 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 5 Aug 2020 18:58:56 -0700
Subject: [PATCH 06/22] Improve the placeholder for the linear depth

---
 .../Runtime/Lighting/LightLoop/scrbound.compute                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 77eb4f6a530..0bbd71b58ed 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -701,7 +701,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // a set of maxs, and each set is equal to g_iNrVisibLights.
         const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex);
 
-        float minLinearDepth = -1, maxLinearDepth = -1; // TODO
+        float minLinearDepth = 0, maxLinearDepth = FLT_MAX; // TODO
 
         g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, minLinearDepth);
         g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, maxLinearDepth);

From 676eb5a564641650c60a805c0df6a0208b7a9ffd Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 5 Aug 2020 20:08:03 -0700
Subject: [PATCH 07/22] Fix aspect

---
 .../Runtime/Lighting/LightLoop/scrbound.compute      | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 0bbd71b58ed..92e39049c78 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -99,14 +99,14 @@ float4x4 Homogenize3x3(float3x3 R)
     return M;
 }
 
-float4x4 PerspectiveProjection4x4(float s, float g, float n, float f)
+float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 {
-    float a = (f + n) * rcp(f - n);
-    float b = -2 * f * n * rcp(f - n);
+    float b = (f + n) * rcp(f - n);
+    float c = -2 * f * n * rcp(f - n);
 
-    return float4x4(g/s, 0, 0, 0,
+    return float4x4(g/a, 0, 0, 0,
                       0, g, 0, 0,
-                      0, 0, a, b,
+                      0, 0, b, c,
                       0, 0, 1, 0);
 }
 
@@ -571,7 +571,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             float g = f;                         // Distance from the eye to the projection plane
 
             float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
-            float4x4 perspProjMatrix = PerspectiveProjection4x4(s, g, n, f);
+            float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f);
 
             lightSpaceMatrix = mul(perspProjMatrix, mul(invTranslateEye, lightSpaceMatrix));
         }

From 433e27e7b6f6f5f87e8fd7376f796a6695544fcc Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 5 Aug 2020 20:41:09 -0700
Subject: [PATCH 08/22] Bugfix: keep negative W, fix FACE_BACK vertex order and clip loop bound

---
 .../Lighting/LightLoop/scrbound.compute       | 155 +++++++++---------
 1 file changed, 81 insertions(+), 74 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 92e39049c78..665296d158b 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -10,7 +10,7 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
 
-// #pragma enable_d3d11_debug_symbols
+#pragma enable_d3d11_debug_symbols
 #pragma only_renderers d3d11 playstation xboxone vulkan metal switch
 
 uniform int g_isOrthographic;
@@ -26,17 +26,12 @@ StructuredBuffer<SFiniteLightBound> g_data : register( t0 );
 // output buffer
 RWStructuredBuffer<float4> g_vBoundsBuffer : register( u0 );
 
-#define DUMB_COMPILER
+#define Z_BINNING
+// #define DUMB_COMPILER
 // #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported
 
 #ifdef Z_BINNING
 
-// Computes r=(n/d) and rounds the result towards the largest adjacent integer.
-uint DivRoundUp(uint n, uint d)
-{
-    return (n + d - 1) / d; // No division by 0 checks
-}
-
 // Returns the location of the N-th set bit starting from the lowest order bit and working upward.
 // Slow implementation - do not use for large bit sets.
 // Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html
@@ -110,21 +105,24 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
                       0, 0, 1, 0);
 }
 
+#define CLEAR_SIGN_BIT(X)  (asuint(X) & INT_MAX)
+#define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks
+
 // Clipping a plane by a cube may produce a hexagon (6-gon).
 // Clipping a hexagon by 4 planes may produce a decagon (10-gon).
 #define MAX_CLIP_VERTS    (10)
-#define NUM_EDGES         (12)
 #define NUM_VERTS         (8)
 #define NUM_FACES         (6)
 #define NUM_PLANES        (6)
-#define THREADS_PER_LIGHT (4)
 #define THREADS_PER_GROUP (64)
+#define THREADS_PER_LIGHT (1) // Set to 1 for debugging
 #define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
 #define VERTS_PER_GROUP   (NUM_VERTS * LIGHTS_PER_GROUP)
 #define VERTS_PER_THREAD  (NUM_VERTS / THREADS_PER_LIGHT)
-#define FACES_PER_THREAD  DivRoundUp(NUM_FACES, THREADS_PER_LIGHT)
+#define FACES_PER_THREAD  DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)
 
 // All planes and faces are always in the standard order (see below).
+// Near and far planes may be swapped for Reverse Z-Buffering, but it does not change the algorithm.
 #define FACE_LEFT   (1 << 0) // x = -1
 #define FACE_RIGHT  (1 << 1) // x = +1
 #define FACE_FRONT  (1 << 2) // y = -1
@@ -136,6 +134,8 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 // TODO: the compiler generates 'tbuffer_load_format_x' instructions
 // when we access the look-up tables. Can we avoid this?
 
+// TODO: try vert order (0 0 0), (1 0 0), (0 1 0), (1 1 0), (0 0 1), (1 0 1), (0 1 1), (1 1 1)
+
 // All vertices are always in the standard order (see below).
 static const uint s_FaceMasksOfVerts[NUM_VERTS] =
 {
@@ -153,12 +153,12 @@ static const uint s_FaceMasksOfVerts[NUM_VERTS] =
 // with normals pointing in the interior of the volume.
 static const uint s_VertMasksOfFaces[NUM_FACES] =
 {
-    3 << 9 | 7 << 6 | 4 << 3 | 0 << 0, // 0: FACE_LEFT
-    5 << 9 | 6 << 6 | 2 << 3 | 1 << 0, // 1: FACE_RIGHT
-    4 << 9 | 5 << 6 | 1 << 3 | 0 << 0, // 2: FACE_FRONT
-    2 << 9 | 6 << 6 | 7 << 3 | 3 << 0, // 3: FACE_BACK
-    1 << 9 | 2 << 6 | 3 << 3 | 0 << 0, // 4: FACE_TOP
-    7 << 9 | 6 << 6 | 5 << 3 | 4 << 0  // 5: FACE_BOTTOM
+    (3) << 9 | (7) << 6 | (4) << 3 | (0) << 0, // 0: FACE_LEFT
+    (5) << 9 | (6) << 6 | (2) << 3 | (1) << 0, // 1: FACE_RIGHT
+    (4) << 9 | (5) << 6 | (1) << 3 | (0) << 0, // 2: FACE_FRONT
+    (6) << 9 | (7) << 6 | (3) << 3 | (2) << 0, // 3: FACE_BACK
+    (1) << 9 | (2) << 6 | (3) << 3 | (0) << 0, // 4: FACE_TOP
+    (7) << 9 | (6) << 6 | (5) << 3 | (4) << 0  // 5: FACE_BOTTOM
 };
 
 // 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
@@ -173,7 +173,7 @@ groupshared uint  gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL
 groupshared uint  gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces  each (HLSL does not support small data types)
 
 // 6 arrays * 16 elements * 4 bytes each = 384 bytes.
-// Note that these are actually floats reinterpreted as uints.
+// These are actually floats reinterpreted as uints.
 // The reason is because floating-point atomic operations are not supported.
 groupshared uint  gs_RapAaBbMinPtX[LIGHTS_PER_GROUP];
 groupshared uint  gs_RapAaBbMaxPtX[LIGHTS_PER_GROUP];
@@ -307,8 +307,8 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_
         // Non-zero if ANY of the vertices are behind any of the planes.
         clipMaskOfFace |= behindMasksOfVerts[v];
 
-        // Note that not all edges may require clipping. However,
-        // filtering the vertex list is somewhat expensive, so we currently don't do it.
+        // Not all edges may require clipping. However, filtering the vertex list
+        // is somewhat expensive, so we currently don't do it.
         vertRingBuffer[j].x = gs_HapVertsX[firstVertexOffset + v];
         vertRingBuffer[j].y = gs_HapVertsY[firstVertexOffset + v];
         vertRingBuffer[j].z = gs_HapVertsZ[firstVertexOffset + v];
@@ -336,14 +336,14 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_
     uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
 #endif
 
-    for (int j = srcBegin; j < (srcBegin + srcSize); j++)
+    for (uint j = srcBegin; j < (srcBegin + srcSize); j++)
     {
     #ifndef DUMB_COMPILER
         uint modSrcIdx = j % MAX_CLIP_VERTS;
     #endif
 
         float4 hapVert = vertRingBuffer[modSrcIdx];
-        float3 rapVert = hapVert.xyz * rcp(hapVert.w);
+        float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
 
         rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
         rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
@@ -357,7 +357,10 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_
 
 #else // !Z_BINNING
 
-#define MAX_PNTS        9       // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed)
+#define THREADS_PER_LIGHT (8)
+#define THREADS_PER_GROUP (64)
+#define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
+#define MAX_PNTS          (9)   // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed)
                                 // However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane
                                 // clipping gets skipped which doesn't cause any errors.
 
@@ -392,8 +395,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     unsigned int g = groupID;
     unsigned int t = threadID;
 
-    const int subLigt = (uint) (t/8);
-    const int lgtIndex = subLigt+(uint) g*8;
+    const int subLigt = (uint) (t/THREADS_PER_LIGHT);
+    const int lgtIndex = subLigt+(uint) g*LIGHTS_PER_GROUP;
     const int sideIndex = (uint) (t%8);
 
     const int eyeAdjustedLgtIndex = GenerateLightCullDataIndex(lgtIndex, g_iNrVisibLights, eyeIndex);
@@ -407,13 +410,13 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     // Since a light volume may be partially off-screen, we must clip it before computing the AABB.
     // Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
     //
-    // To avoid having to deal with toroidal properties of the perspective transform,
+    // To avoid having to deal with the "Moebius twist" property of the perspective transform,
     // we perform clipping using the homogeneous (projective) post-perspective coordinates.
     // This clipping method in described in Blinn's paper titled "Line Clipping".
     //
     // The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
     // worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
-    // Note that some faces may require culling rather than clipping (the former is simpler).
+    // Some faces may require culling rather than clipping (the former is simpler).
     //
     // It's important to realize that face culling may end up culling 5 (or even all 6) faces.
     // This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
@@ -433,20 +436,23 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
     const float  scale = lgtDat.scaleXY;      // scale.x = scale.y
     const float3 rbpC  = lgtDat.center.xyz;
+    // TODO: store X, Y, Scale
     const float3 rbpX  = lgtDat.boxAxisX.xyz; // Pre-scaled
     const float3 rbpY  = lgtDat.boxAxisY.xyz; // Pre-scaled
     const float3 rbpZ  = lgtDat.boxAxisZ.xyz; // Pre-scaled
 
 #ifndef USE_WAVE_INTRINSICS
-    // Initialize the TGSM. All threads write the same value -> no data races.
-    // The hardware will coalesce the writes.
-    gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside
-    gs_RapAaBbMinPtX[groupLocalLightIndex]     = asuint(1.0f);
-    gs_RapAaBbMaxPtX[groupLocalLightIndex]     = asuint(0.0f);
-    gs_RapAaBbMinPtY[groupLocalLightIndex]     = asuint(1.0f);
-    gs_RapAaBbMaxPtY[groupLocalLightIndex]     = asuint(0.0f);
-    gs_RapAaBbMinPtZ[groupLocalLightIndex]     = asuint(1.0f);
-    gs_RapAaBbMaxPtZ[groupLocalLightIndex]     = asuint(0.0f);
+    // (0) Initialize the TGSM.
+    if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
+    {
+        gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside
+        gs_RapAaBbMinPtX[groupLocalLightIndex]     = asuint(1.0f);
+        gs_RapAaBbMaxPtX[groupLocalLightIndex]     = asuint(0.0f);
+        gs_RapAaBbMinPtY[groupLocalLightIndex]     = asuint(1.0f);
+        gs_RapAaBbMaxPtY[groupLocalLightIndex]     = asuint(0.0f);
+        gs_RapAaBbMinPtZ[groupLocalLightIndex]     = asuint(1.0f);
+        gs_RapAaBbMaxPtZ[groupLocalLightIndex]     = asuint(0.0f);
+    }
 #endif // USE_WAVE_INTRINSICS
 
     float3 rapAaBbMinPt = 1;
@@ -482,20 +488,16 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
         // Avoid generating (w = 0).
-        rbpVert.z = (abs(rbpVert.z) >= FLT_EPS) ? rbpVert.z : FLT_EPS;
+        rbpVert.z = (abs(rbpVert.z) > FLT_MIN) ? rbpVert.z : FLT_MIN;
 
         float4 hapVert = mul(g_mProjection, float4(rbpVert, 1));
 
-        // Make sure the W component is strictly positive.
-        // It is helpful in order to simplify clipping and to avoid perspective division by 0.
-        float w = hapVert.w;
-        float s = (w >= 0) ? 1 : -1;
+        // Warning: the W component may be negative.
+        // Flipping the -W pyramid by negating all coordinates is incorrect
+        // and will break both classification and clipping.
 
         // Transform the X and Y components: [-w, w] -> [0, w].
-        hapVert.x = (0.5 * s) * hapVert.x + ((0.5 * s) * w);
-        hapVert.y = (0.5 * s) * hapVert.y + ((0.5 * s) * w);
-        hapVert.z = s * hapVert.z;
-        hapVert.w = s * hapVert.w;
+        hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);
 
         // For each vertex, we must determine whether it is within the bounds.
         // For culling and clipping, we must know, per culling plane, whether the vertex
@@ -509,17 +511,18 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // w is always valid
         // For the orthographic projection, (w = 1), so no modifications are necessary.
         // TODO: epsilon for numerical robustness?
-        w = hapVert.w;
 
         for (uint j = 0; j < (NUM_PLANES / 2); j++)
         {
+            float w = hapVert.w;
+
             behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
             behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
         }
 
         if (behindMask == 0) // Inside?
         {
-            float3 rapVert = hapVert.xyz * rcp(hapVert.w);
+            float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
 
             rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
             rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
@@ -573,7 +576,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
             float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f);
 
-            lightSpaceMatrix = mul(perspProjMatrix, mul(invTranslateEye, lightSpaceMatrix));
+            lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix);
         }
 
         for (uint i = 0; i < VERTS_PER_THREAD; i++)
@@ -601,7 +604,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             // Consider the vertex to be inside the light volume if:
             // -w < x < w
             // -w < y < w   <-- exclude boundary points, as we will not clip using these vertices
-            // -w < z < w
+            // -w < z < w   <-- assume that Z-precision is not very important here
             // 0  < w
             // For the orthographic projection, (w = 1), so no modifications are necessary.
             // TODO: epsilon for numerical robustness?
@@ -627,20 +630,22 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     }
 
     // (3) Cull the faces.
-    const uint cullFaceMask   = cullClipFaceMask;
-    const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
-
-    for (uint i = 0; i < FACES_PER_THREAD; i++)
     {
-        uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+        const uint cullFaceMask   = cullClipFaceMask;
+        const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
 
-        if (n < numFacesToCull)
+        for (uint i = 0; i < FACES_PER_THREAD; i++)
         {
-            uint f = NthBitLow(cullFaceMask, n);
+            uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
 
-            if (TryCullFace(f, behindMasksOfVerts))
+            if (n < numFacesToCull)
             {
-                cullClipFaceMask ^= 1 << f; // Clear the bit
+                uint f = NthBitLow(cullFaceMask, n);
+
+                if (TryCullFace(f, behindMasksOfVerts))
+                {
+                    cullClipFaceMask ^= 1 << f; // Clear the bit
+                }
             }
         }
     }
@@ -656,19 +661,21 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 #endif
 
     // (4) Clip the faces.
-    const uint clipFaceMask   = cullClipFaceMask;
-    const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
-
-    for (uint i = 0; i < FACES_PER_THREAD; i++)
     {
-        uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+        const uint clipFaceMask   = cullClipFaceMask;
+        const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
 
-        if (n < numFacesToCull)
+        for (uint i = 0; i < FACES_PER_THREAD; i++)
         {
-            uint f = NthBitLow(clipFaceMask, n);
+            uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+            if (n < numFacesToClip)
+            {
+                uint f = NthBitLow(clipFaceMask, n);
 
-            ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset,
-                                                   rapAaBbMinPt, rapAaBbMaxPt);
+                ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset,
+                                                       rapAaBbMinPt, rapAaBbMaxPt);
+            }
         }
     }
 
@@ -677,12 +684,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 #else
     // Integer comparison works for floating-point numbers as long as the sign bit is 0.
     // We must take care of the signed zero ourselves.
-    InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(rapAaBbMinPt.x) & INT_MAX);
-    InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(rapAaBbMaxPt.x) & INT_MAX);
-    InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(rapAaBbMinPt.y) & INT_MAX);
-    InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(rapAaBbMaxPt.y) & INT_MAX);
-    InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(rapAaBbMinPt.z) & INT_MAX);
-    InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(rapAaBbMaxPt.z) & INT_MAX);
+    InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.x)));
+    InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.x)));
+    InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.y)));
+    InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.y)));
+    InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.z)));
+    InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.z)));
 
     GroupMemoryBarrierWithGroupSync();
 
@@ -694,7 +701,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     rapAaBbMaxPt.z = asfloat(gs_RapAaBbMaxPtZ[groupLocalLightIndex]);
 #endif // USE_WAVE_INTRINSICS
 
-    if (t % THREADS_PER_LIGHT == 0)
+    if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
     {
         // Each light's AABB is represented by two float3s, the min and max of the box.
         // And for stereo, we have two sets of lights. Therefore, each eye has a set of mins, followed by

From 8a2458a5f5d2f362fce4090f8e9466cfa86712d3 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Thu, 6 Aug 2020 21:38:10 -0700
Subject: [PATCH 09/22] Optimize; fix operator precedence in (v & 2) != 0

---
 .../Lighting/LightLoop/scrbound.compute       | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 665296d158b..bee9499abdd 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -10,7 +10,7 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
 
-#pragma enable_d3d11_debug_symbols
+// #pragma enable_d3d11_debug_symbols
 #pragma only_renderers d3d11 playstation xboxone vulkan metal switch
 
 uniform int g_isOrthographic;
@@ -27,7 +27,7 @@ StructuredBuffer<SFiniteLightBound> g_data : register( t0 );
 RWStructuredBuffer<float4> g_vBoundsBuffer : register( u0 );
 
 #define Z_BINNING
-// #define DUMB_COMPILER
+#define DUMB_COMPILER
 // #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported
 
 #ifdef Z_BINNING
@@ -80,7 +80,6 @@ float3x3 Invert3x3(float3x3 R)
     float3x3 adj = float3x3(cross(C[1], C[2]),
                             cross(C[2], C[0]),
                             cross(C[0], C[1]));
-
     return rcp(det) * adj;
 }
 
@@ -90,14 +89,13 @@ float4x4 Homogenize3x3(float3x3 R)
                           float4(R[1], 0),
                           float4(R[2], 0),
                           float4(0,0,0,1));
-
     return M;
 }
 
 float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 {
-    float b = (f + n) * rcp(f - n);
-    float c = -2 * f * n * rcp(f - n);
+    float b = (f + n) * rcp(f - n);    // z: [-1, 1]
+    float c = -2 * f * n * rcp(f - n); // No Z-reversal
 
     return float4x4(g/a, 0, 0, 0,
                       0, g, 0, 0,
@@ -115,14 +113,14 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 #define NUM_FACES         (6)
 #define NUM_PLANES        (6)
 #define THREADS_PER_GROUP (64)
-#define THREADS_PER_LIGHT (1) // Set to 1 for debugging
+#define THREADS_PER_LIGHT (4) // Set to 1 for debugging
 #define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
 #define VERTS_PER_GROUP   (NUM_VERTS * LIGHTS_PER_GROUP)
 #define VERTS_PER_THREAD  (NUM_VERTS / THREADS_PER_LIGHT)
 #define FACES_PER_THREAD  DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)
 
 // All planes and faces are always in the standard order (see below).
-// Near and far planes may be swapped for Reverse Z-Buffering, but it does not change the algorithm.
+// Near and far planes are swapped in the case of Z-reversal, but it does not affect the algorithm.
 #define FACE_LEFT   (1 << 0) // x = -1
 #define FACE_RIGHT  (1 << 1) // x = +1
 #define FACE_FRONT  (1 << 2) // y = -1
@@ -481,7 +479,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         float3 m; // See the comment above
 
         m.x = (countbits(v % 4) == 1) ? 1 : -1;
-        m.y = (v & 2 != 0)            ? 1 : -1;
+        m.y = ((v & 2) != 0)          ? 1 : -1;
         m.z = (v >= 4)                ? 1 : -1;
 
         m.xy *= (v >= 4) ? 1 : scale;
@@ -506,7 +504,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         // Consider the vertex to be inside the view volume if:
         // 0 <= x <= w
-        // 0 <= y <= w   <-- include boundary points, to avoid clipping them later
+        // 0 <= y <= w   <-- include boundary points to avoid clipping them later
         // 0 <= z <= w
         // w is always valid
         // For the orthographic projection, (w = 1), so no modifications are necessary.
@@ -561,7 +559,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(Rotation3x3(rbpX, rbpY, rbpZ)));
         // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.
 
-        // This (orhographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
+        // This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
         float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace);
 
         if (scale != 1) // Perspective light space?
@@ -595,7 +593,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             float3 rapVertCS; // See the comment above
 
             rapVertCS.x = (countbits(v % 4) == 1) ? 1 : -1;
-            rapVertCS.y = (v & 2 != 0)            ? 1 : -1;
+            rapVertCS.y = ((v & 2) != 0)          ? 1 : -1;
             rapVertCS.z = (v >= 4)                ? 1 :  0;
 
             float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space
@@ -708,10 +706,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // a set of maxs, and each set is equal to g_iNrVisibLights.
         const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex);
 
-        float minLinearDepth = 0, maxLinearDepth = FLT_MAX; // TODO
-
-        g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, minLinearDepth);
-        g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, maxLinearDepth);
+        g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, 0);      // TODO: add me - lin depth
+        g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, 100000); //
     }
 
 #else // !Z_BINNING

From 0453c6c5d0a15db7fe5efe5a48bf8f8282f0da6a Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Fri, 7 Aug 2020 13:15:40 -0700
Subject: [PATCH 10/22] Also store view space Z

---
 .../Lighting/LightLoop/scrbound.compute       | 112 ++++++++++--------
 1 file changed, 65 insertions(+), 47 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index bee9499abdd..abb17765989 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -170,15 +170,17 @@ groupshared uint  gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL
 // 1 array *  16 elements * 4 bytes each = 64 bytes.
 groupshared uint  gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces  each (HLSL does not support small data types)
 
-// 6 arrays * 16 elements * 4 bytes each = 384 bytes.
+// 8 arrays * 16 elements * 4 bytes each = 512 bytes.
 // These are actually floats reinterpreted as uints.
 // The reason is because floating-point atomic operations are not supported.
-groupshared uint  gs_RapAaBbMinPtX[LIGHTS_PER_GROUP];
-groupshared uint  gs_RapAaBbMaxPtX[LIGHTS_PER_GROUP];
-groupshared uint  gs_RapAaBbMinPtY[LIGHTS_PER_GROUP];
-groupshared uint  gs_RapAaBbMaxPtY[LIGHTS_PER_GROUP];
-groupshared uint  gs_RapAaBbMinPtZ[LIGHTS_PER_GROUP];
-groupshared uint  gs_RapAaBbMaxPtZ[LIGHTS_PER_GROUP];
+groupshared uint  gs_NdcAaBbMinPtX[LIGHTS_PER_GROUP];
+groupshared uint  gs_NdcAaBbMaxPtX[LIGHTS_PER_GROUP];
+groupshared uint  gs_NdcAaBbMinPtY[LIGHTS_PER_GROUP];
+groupshared uint  gs_NdcAaBbMaxPtY[LIGHTS_PER_GROUP];
+groupshared uint  gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z cannot be trivially reconstructed
+groupshared uint  gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique.
+groupshared uint  gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
+groupshared uint  gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
 #endif // USE_WAVE_INTRINSICS
 
 // Returns 'true' if it manages to cull the face.
@@ -290,8 +292,8 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
     }
 }
 
-void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset,
-                                            inout float3 rapAaBbMinPt, inout float3 rapAaBbMaxPt)
+void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset, float4x4 g_mInvProjection,
+                                            inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
 {
     float4 vertRingBuffer[MAX_CLIP_VERTS];
     uint srcBegin = 0, srcSize = 4;
@@ -340,11 +342,15 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_
         uint modSrcIdx = j % MAX_CLIP_VERTS;
     #endif
 
-        float4 hapVert = vertRingBuffer[modSrcIdx];
-        float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
+        float4 hapVert    = vertRingBuffer[modSrcIdx];
+        float4 hbpVertVS  = mul(g_mInvProjection, hapVert);         // Just to support orthographic projection
+        float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
+        float  rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);
 
-        rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
-        rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
+        ndcAaBbMinPt.xyz  = min(ndcAaBbMinPt.xyz, rapVertNDC);
+        ndcAaBbMaxPt.xyz  = max(ndcAaBbMaxPt.xyz, rapVertNDC);
+        ndcAaBbMinPt.w    = min(ndcAaBbMinPt.w,   rbpVertVSz);
+        ndcAaBbMaxPt.w    = max(ndcAaBbMaxPt.w,   rbpVertVSz);
 
     #ifdef DUMB_COMPILER
         modSrcIdx++;
@@ -444,17 +450,19 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
     {
         gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside
-        gs_RapAaBbMinPtX[groupLocalLightIndex]     = asuint(1.0f);
-        gs_RapAaBbMaxPtX[groupLocalLightIndex]     = asuint(0.0f);
-        gs_RapAaBbMinPtY[groupLocalLightIndex]     = asuint(1.0f);
-        gs_RapAaBbMaxPtY[groupLocalLightIndex]     = asuint(0.0f);
-        gs_RapAaBbMinPtZ[groupLocalLightIndex]     = asuint(1.0f);
-        gs_RapAaBbMaxPtZ[groupLocalLightIndex]     = asuint(0.0f);
+        gs_NdcAaBbMinPtX[groupLocalLightIndex]     = asuint(1.0f);
+        gs_NdcAaBbMaxPtX[groupLocalLightIndex]     = asuint(0.0f);
+        gs_NdcAaBbMinPtY[groupLocalLightIndex]     = asuint(1.0f);
+        gs_NdcAaBbMaxPtY[groupLocalLightIndex]     = asuint(0.0f);
+        gs_NdcAaBbMinPtZ[groupLocalLightIndex]     = asuint(1.0f);
+        gs_NdcAaBbMaxPtZ[groupLocalLightIndex]     = asuint(0.0f);
+        gs_NdcAaBbMinPtW[groupLocalLightIndex]     = asuint(FLT_INF);
+        gs_NdcAaBbMaxPtW[groupLocalLightIndex]     = asuint(0.0f);
     }
 #endif // USE_WAVE_INTRINSICS
 
-    float3 rapAaBbMinPt = 1;
-    float3 rapAaBbMaxPt = 0;
+    float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF);
+    float4 ndcAaBbMaxPt = 0;
 
     // We must determine whether we have to clip or cull any of the faces.
     // If all vertices of a face are inside with respect to all the culling planes,
@@ -484,11 +492,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         m.xy *= (v >= 4) ? 1 : scale;
 
-        float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
+        float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
         // Avoid generating (w = 0).
-        rbpVert.z = (abs(rbpVert.z) > FLT_MIN) ? rbpVert.z : FLT_MIN;
+        rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN;
 
-        float4 hapVert = mul(g_mProjection, float4(rbpVert, 1));
+        float4 hapVert = mul(g_mProjection, float4(rbpVertVS, 1));
 
         // Warning: the W component may be negative.
         // Flipping the -W pyramid by negating all coordinates is incorrect
@@ -497,6 +505,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // Transform the X and Y components: [-w, w] -> [0, w].
         hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);
 
+        // TODO: multiply vertex by ViewZ if orthographic for unified processing!
+
         // For each vertex, we must determine whether it is within the bounds.
         // For culling and clipping, we must know, per culling plane, whether the vertex
         // is in the positive or the negative half-space.
@@ -520,10 +530,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         if (behindMask == 0) // Inside?
         {
-            float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
+            float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
 
-            rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
-            rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
+            ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC);
+            ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC);
+            ndcAaBbMinPt.w   = min(ndcAaBbMinPt.w,   rbpVertVS.z);
+            ndcAaBbMaxPt.w   = max(ndcAaBbMaxPt.w,   rbpVertVS.z);
         }
         else // Outside
         {
@@ -612,10 +624,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             if (inside)
             {
                 float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
+                float  rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);
 
-                // Update the AABB.
-                rapAaBbMinPt = min(rapAaBbMinPt, rapVertNDC);
-                rapAaBbMaxPt = max(rapAaBbMaxPt, rapVertNDC);
+                ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC);
+                ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC);
+                ndcAaBbMinPt.w   = min(ndcAaBbMinPt.w,   rbpVertVSz);
+                ndcAaBbMaxPt.w   = max(ndcAaBbMaxPt.w,   rbpVertVSz);
             }
         }
     }
@@ -671,8 +685,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             {
                 uint f = NthBitLow(clipFaceMask, n);
 
-                ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset,
-                                                       rapAaBbMinPt, rapAaBbMaxPt);
+                ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset, g_mInvProjection,
+                                                       ndcAaBbMinPt, ndcAaBbMaxPt);
             }
         }
     }
@@ -681,22 +695,26 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     // ...
 #else
     // Integer comparison works for floating-point numbers as long as the sign bit is 0.
-    // We must take care of the signed zero ourselves.
-    InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.x)));
-    InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.x)));
-    InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.y)));
-    InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.y)));
-    InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.z)));
-    InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.z)));
+    // We must take care of the signed zero ourselves. saturate() does not help here.
+    InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
+    InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
+    InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
+    InterlockedMax(gs_NdcAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y)));
+    InterlockedMin(gs_NdcAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z)));
+    InterlockedMax(gs_NdcAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z)));
+    InterlockedMin(gs_NdcAaBbMinPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w)));
+    InterlockedMax(gs_NdcAaBbMaxPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w)));
 
     GroupMemoryBarrierWithGroupSync();
 
-    rapAaBbMinPt.x = asfloat(gs_RapAaBbMinPtX[groupLocalLightIndex]);
-    rapAaBbMaxPt.x = asfloat(gs_RapAaBbMaxPtX[groupLocalLightIndex]);
-    rapAaBbMinPt.y = asfloat(gs_RapAaBbMinPtY[groupLocalLightIndex]);
-    rapAaBbMaxPt.y = asfloat(gs_RapAaBbMaxPtY[groupLocalLightIndex]);
-    rapAaBbMinPt.z = asfloat(gs_RapAaBbMinPtZ[groupLocalLightIndex]);
-    rapAaBbMaxPt.z = asfloat(gs_RapAaBbMaxPtZ[groupLocalLightIndex]);
+    ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[groupLocalLightIndex]);
+    ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[groupLocalLightIndex]);
+    ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[groupLocalLightIndex]);
+    ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[groupLocalLightIndex]);
+    ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[groupLocalLightIndex]);
+    ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[groupLocalLightIndex]);
+    ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[groupLocalLightIndex]);
+    ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[groupLocalLightIndex]);
 #endif // USE_WAVE_INTRINSICS
 
     if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
@@ -706,8 +724,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // a set of maxs, and each set is equal to g_iNrVisibLights.
         const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex);
 
-        g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, 0);      // TODO: add me - lin depth
-        g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, 100000); //
+        g_vBoundsBuffer[boundsIndices.min] = ndcAaBbMinPt;
+        g_vBoundsBuffer[boundsIndices.max] = ndcAaBbMaxPt;
     }
 
 #else // !Z_BINNING

From 7aa331ce25d7f7c2362ecf4fd162638d6588ed7a Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Sat, 8 Aug 2020 14:05:14 -0700
Subject: [PATCH 11/22] Optimize orthographic

---
 .../Lighting/LightLoop/scrbound.compute       | 65 ++++++++++---------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index abb17765989..a2cbbbd9e74 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -120,7 +120,7 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 #define FACES_PER_THREAD  DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)
 
 // All planes and faces are always in the standard order (see below).
-// Near and far planes are swapped in the case of Z-reversal, but it does not affect the algorithm.
+// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
 #define FACE_LEFT   (1 << 0) // x = -1
 #define FACE_RIGHT  (1 << 1) // x = +1
 #define FACE_FRONT  (1 << 2) // y = -1
@@ -254,7 +254,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
         // 1. v0 in,  v1 out -> add intersection
         // 2. v0 out, v1 in  -> add intersection, add v1
         // 3. v0 in,  v1 in  -> add v1
-        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of the signed zero.
+        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of -0.
 
         if ((tailVert.bc >= 0) != (leadVert.bc >= 0))
         {
@@ -292,11 +292,12 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
     }
 }
 
-void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset, float4x4 g_mInvProjection,
-                                            inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
+void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset,
+                               out uint srcBegin, out uint srcSize,
+                               out float4 vertRingBuffer[MAX_CLIP_VERTS])
 {
-    float4 vertRingBuffer[MAX_CLIP_VERTS];
-    uint srcBegin = 0, srcSize = 4;
+    srcBegin = 0;
+    srcSize  = 4;
 
     uint clipMaskOfFace = 0; // Initially in front
     uint vertMaskOfFace = s_VertMasksOfFaces[f];
@@ -331,7 +332,12 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_
 
         clipMaskOfFace ^= 1 << p; // Clear the bit to continue using firstbitlow()
     }
+}
 
+void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERTS],
+                bool isOrthoProj, float4x4 invProj,
+                inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
+{
 #ifdef DUMB_COMPILER
     uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
 #endif
@@ -341,17 +347,18 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_
     #ifndef DUMB_COMPILER
         uint modSrcIdx = j % MAX_CLIP_VERTS;
     #endif
-
         float4 hapVert    = vertRingBuffer[modSrcIdx];
-        float4 hbpVertVS  = mul(g_mInvProjection, hapVert);         // Just to support orthographic projection
-        float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
-        float  rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);
+        // Clamp to the bounds in case of numerical errors (may still generate -0).
+        float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
+        float  rbpVertVSz = hapVert.w;
 
-        ndcAaBbMinPt.xyz  = min(ndcAaBbMinPt.xyz, rapVertNDC);
-        ndcAaBbMaxPt.xyz  = max(ndcAaBbMaxPt.xyz, rapVertNDC);
-        ndcAaBbMinPt.w    = min(ndcAaBbMinPt.w,   rbpVertVSz);
-        ndcAaBbMaxPt.w    = max(ndcAaBbMaxPt.w,   rbpVertVSz);
+        if (isOrthoProj) // Must replace (w = 1)
+        {
+            rbpVertVSz = dot(invProj[2], hapVert);
+        }
 
+        ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
+        ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));
     #ifdef DUMB_COMPILER
         modSrcIdx++;
         modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx;
@@ -501,12 +508,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // Warning: the W component may be negative.
         // Flipping the -W pyramid by negating all coordinates is incorrect
         // and will break both classification and clipping.
+        // For the orthographic projection, (w = 1).
 
         // Transform the X and Y components: [-w, w] -> [0, w].
         hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);
 
-        // TODO: multiply vertex by ViewZ if orthographic for unified processing!
-
         // For each vertex, we must determine whether it is within the bounds.
         // For culling and clipping, we must know, per culling plane, whether the vertex
         // is in the positive or the negative half-space.
@@ -517,7 +523,6 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // 0 <= y <= w   <-- include boundary points to avoid clipping them later
         // 0 <= z <= w
         // w is always valid
-        // For the orthographic projection, (w = 1), so no modifications are necessary.
         // TODO: epsilon for numerical robustness?
 
         for (uint j = 0; j < (NUM_PLANES / 2); j++)
@@ -530,12 +535,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         if (behindMask == 0) // Inside?
         {
-            float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values
+            // Clamp to the bounds in case of numerical errors (may still generate -0).
+            float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
 
-            ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC);
-            ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC);
-            ndcAaBbMinPt.w   = min(ndcAaBbMinPt.w,   rbpVertVS.z);
-            ndcAaBbMaxPt.w   = max(ndcAaBbMaxPt.w,   rbpVertVS.z);
+            ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVS.z));
+            ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVS.z));
         }
         else // Outside
         {
@@ -616,7 +620,6 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             // -w < y < w   <-- exclude boundary points, as we will not clip using these vertices
             // -w < z < w   <-- assume that Z-precision is not very important here
             // 0  < w
-            // For the orthographic projection, (w = 1), so no modifications are necessary.
             // TODO: epsilon for numerical robustness?
 
             bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w;
@@ -626,10 +629,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
                 float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
                 float  rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);
 
-                ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC);
-                ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC);
-                ndcAaBbMinPt.w   = min(ndcAaBbMinPt.w,   rbpVertVSz);
-                ndcAaBbMaxPt.w   = max(ndcAaBbMaxPt.w,   rbpVertVSz);
+                ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
+                ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));
             }
         }
     }
@@ -685,8 +686,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             {
                 uint f = NthBitLow(clipFaceMask, n);
 
-                ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset, g_mInvProjection,
-                                                       ndcAaBbMinPt, ndcAaBbMaxPt);
+                uint   srcBegin, srcSize;
+                float4 vertRingBuffer[MAX_CLIP_VERTS];
+                ClipFaceAgainstViewVolume(f, behindMasksOfVerts, firstVertexOffset,
+                                          srcBegin, srcSize, vertRingBuffer);
+                UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, g_mInvProjection,
+                           ndcAaBbMinPt, ndcAaBbMaxPt);
             }
         }
     }
@@ -695,7 +700,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     // ...
 #else
     // Integer comparison works for floating-point numbers as long as the sign bit is 0.
-    // We must take care of the signed zero ourselves. saturate() does not help here.
+    // We must take care of -0 ourselves. saturate() does not help here.
     InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
     InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
     InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));

From 05a222e96f602f49088e141a647e8e8255e04892 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Mon, 10 Aug 2020 13:10:23 -0700
Subject: [PATCH 12/22] Optimize LUT

---
 .../Lighting/LightLoop/scrbound.compute       | 144 +++++++++---------
 1 file changed, 71 insertions(+), 73 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index a2cbbbd9e74..32c403ce85d 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -65,7 +65,8 @@ float4x4 Translation4x4(float3 d)
     return M;
 }
 
-float3x3 Rotation3x3(float3 xAxis, float3 yAxis, float3 zAxis)
+// Scale followed by rotation (scaled axes).
+float3x3 ScaledRotation3x3(float3 xAxis, float3 yAxis, float3 zAxis)
 {
     float3x3 R = float3x3(xAxis, yAxis, zAxis);
     float3x3 C = transpose(R); // Row to column
@@ -94,7 +95,7 @@ float4x4 Homogenize3x3(float3x3 R)
 
 float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 {
-    float b = (f + n) * rcp(f - n);    // z: [-1, 1]
+    float b = (f + n) * rcp(f - n);    // Z in [-1, 1]
     float c = -2 * f * n * rcp(f - n); // No Z-reversal
 
     return float4x4(g/a, 0, 0, 0,
@@ -103,7 +104,7 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
                       0, 0, 1, 0);
 }
 
-#define CLEAR_SIGN_BIT(X)  (asuint(X) & INT_MAX)
+#define CLEAR_SIGN_BIT(X)  (asint(X) & INT_MAX)
 #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks
 
 // Clipping a plane by a cube may produce a hexagon (6-gon).
@@ -121,43 +122,60 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 
 // All planes and faces are always in the standard order (see below).
 // Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
-#define FACE_LEFT   (1 << 0) // x = -1
-#define FACE_RIGHT  (1 << 1) // x = +1
-#define FACE_FRONT  (1 << 2) // y = -1
-#define FACE_BACK   (1 << 3) // y = +1
-#define FACE_TOP    (1 << 4) // z = -1
-#define FACE_BOTTOM (1 << 5) // z = +1
+#define FACE_LEFT   (1 << 0) // -X     z
+#define FACE_RIGHT  (1 << 1) // +X    /
+#define FACE_TOP    (1 << 2) // -Y   0 -- x
+#define FACE_BOTTOM (1 << 3) // +Y   |
+#define FACE_FRONT  (1 << 4) // -Z   y
+#define FACE_BACK   (1 << 5) // +Z
 #define FACE_MASK   ((1 << NUM_FACES) - 1)
 
-// TODO: the compiler generates 'tbuffer_load_format_x' instructions
-// when we access the look-up tables. Can we avoid this?
-
-// TODO: try vert order (0 0 0), (1 0 0), (0 1 0), (1 1 0), (0 0 1), (1 0 1), (0 1 1), (1 1 1)
+// A list of vertices for each face (CCW order w.r.t. its normal, starting from the LSB).
+#define VERT_LIST_LEFT   ((2) << 9 | (6) << 6 | (4) << 3 | (0) << 0)
+#define VERT_LIST_RIGHT  ((5) << 9 | (7) << 6 | (3) << 3 | (1) << 0)
+#define VERT_LIST_TOP    ((1) << 9 | (3) << 6 | (2) << 3 | (0) << 0)
+#define VERT_LIST_BOTTOM ((6) << 9 | (7) << 6 | (5) << 3 | (4) << 0)
+#define VERT_LIST_FRONT  ((4) << 9 | (5) << 6 | (1) << 3 | (0) << 0)
+#define VERT_LIST_BACK   ((3) << 9 | (7) << 6 | (6) << 3 | (2) << 0)
 
 // All vertices are always in the standard order (see below).
-static const uint s_FaceMasksOfVerts[NUM_VERTS] =
+uint GetFaceMaskOfVertex(uint v)
 {
-    FACE_LEFT  | FACE_FRONT | FACE_TOP,    // 0: (-1, -1, -1)
-    FACE_RIGHT | FACE_FRONT | FACE_TOP,    // 1: (+1, -1, -1)
-    FACE_RIGHT | FACE_BACK  | FACE_TOP,    // 2: (+1, +1, -1)
-    FACE_LEFT  | FACE_BACK  | FACE_TOP,    // 3: (-1, +1, -1)
-    FACE_LEFT  | FACE_FRONT | FACE_BOTTOM, // 4: (-1, -1, +1)
-    FACE_RIGHT | FACE_FRONT | FACE_BOTTOM, // 5: (+1, -1, +1)
-    FACE_RIGHT | FACE_BACK  | FACE_BOTTOM, // 6: (+1, +1, +1)
-    FACE_LEFT  | FACE_BACK  | FACE_BOTTOM  // 7: (-1, +1, +1)
+    // 0: (-1, -1, -1) -> { FACE_LEFT  | FACE_TOP    | FACE_FRONT }
+    // 1: (+1, -1, -1) -> { FACE_RIGHT | FACE_TOP    | FACE_FRONT }
+    // 2: (-1, +1, -1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_FRONT }
+    // 3: (+1, +1, -1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_FRONT }
+    // 4: (-1, -1, +1) -> { FACE_LEFT  | FACE_TOP    | FACE_BACK  }
+    // 5: (+1, -1, +1) -> { FACE_RIGHT | FACE_TOP    | FACE_BACK  }
+    // 6: (-1, +1, +1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_BACK  }
+    // 7: (+1, +1, +1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_BACK  }
+    // (((v & 1) == 0) ? 1 : 2) | (((v & 2) == 0) ? 4 : 8) | (((v & 4) == 0) ? 16 : 32)
+    uint f = (FACE_LEFT  << BitFieldExtract(v, 0, 1))
+           | (FACE_TOP   << BitFieldExtract(v, 1, 1))
+           | (FACE_FRONT << BitFieldExtract(v, 2, 1));
+
+    return f;
 };
 
-// CCW order (starting with the LSB) of vertices for each face (w.r.t. its normal),
-// with normals pointing in the interior of the volume.
-static const uint s_VertMasksOfFaces[NUM_FACES] =
+float3 GenerateVertexOfStandardCube(uint v)
 {
-    (3) << 9 | (7) << 6 | (4) << 3 | (0) << 0, // 0: FACE_LEFT
-    (5) << 9 | (6) << 6 | (2) << 3 | (1) << 0, // 1: FACE_RIGHT
-    (4) << 9 | (5) << 6 | (1) << 3 | (0) << 0, // 2: FACE_FRONT
-    (6) << 9 | (7) << 6 | (3) << 3 | (2) << 0, // 3: FACE_BACK
-    (1) << 9 | (2) << 6 | (3) << 3 | (0) << 0, // 4: FACE_TOP
-    (7) << 9 | (6) << 6 | (5) << 3 | (4) << 0  // 5: FACE_BOTTOM
-};
+    float3 p;
+
+    p.x = ((v & 1) == 0) ? -1 : 1;
+    p.y = ((v & 2) == 0) ? -1 : 1;
+    p.z = ((v & 4) == 0) ? -1 : 1;
+
+    return p;
+}
+
+uint GetVertexListOfFace(uint f)
+{
+    static const uint3 allVertLists = uint3((VERT_LIST_RIGHT  << 12) | VERT_LIST_LEFT,
+                                            (VERT_LIST_BOTTOM << 12) | VERT_LIST_TOP,
+                                            (VERT_LIST_BACK   << 12) | VERT_LIST_FRONT);
+
+    return BitFieldExtract(allVertLists[f >> 1], 12 * (f & 1), 12);
+}
 
 // 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
 groupshared float gs_HapVertsX[VERTS_PER_GROUP];
@@ -187,11 +205,11 @@ groupshared uint  gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
 bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
 {
     uint cullMaskOfFace = FACE_MASK; // Initially behind
-    uint vertMaskOfFace = s_VertMasksOfFaces[f];
+    uint vertListOfFace = GetVertexListOfFace(f);
 
     for (int j = 0; j < 4; j++)
     {
-        uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3);
+        uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
         // Non-zero if ALL the vertices are behind any of the planes.
         cullMaskOfFace &= behindMasksOfVerts[v];
     }
@@ -207,9 +225,9 @@ struct ClipVertex
 
 ClipVertex CreateClipVertex(uint p, float4 v)
 {
-    bool evenPlane = (p % 2) == 0;
+    bool evenPlane = (p & 1) == 0;
 
-    float c = v[p / 2];
+    float c = v[p >> 1];
     float w = v.w;
 
     ClipVertex cv;
@@ -300,7 +318,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
     srcSize  = 4;
 
     uint clipMaskOfFace = 0; // Initially in front
-    uint vertMaskOfFace = s_VertMasksOfFaces[f];
+    uint vertListOfFace = GetVertexListOfFace(f);
 
     for (int j = 0; j < 4; j++)
     {
@@ -316,11 +334,9 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
         vertRingBuffer[j].w = gs_HapVertsW[firstVertexOffset + v];
     }
 
-    const uint numPlanesToClipAgainst = countbits(clipMaskOfFace); // [1, 6]
-
     // Sutherland-Hodgeman polygon clipping algorithm.
     // It works by clipping the entire polygon against one clipping plane at a time.
-    for (uint j = 0; j < numPlanesToClipAgainst; j++)
+    while (clipMaskOfFace != 0)
     {
         uint p = firstbitlow(clipMaskOfFace);
 
@@ -341,13 +357,12 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT
 #ifdef DUMB_COMPILER
     uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
 #endif
-
     for (uint j = srcBegin; j < (srcBegin + srcSize); j++)
     {
     #ifndef DUMB_COMPILER
         uint modSrcIdx = j % MAX_CLIP_VERTS;
     #endif
-        float4 hapVert    = vertRingBuffer[modSrcIdx];
+        float4 hapVert = vertRingBuffer[modSrcIdx];
         // Clamp to the bounds in case of numerical errors (may still generate -0).
         float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
         float  rbpVertVSz = hapVert.w;
@@ -482,22 +497,17 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     {
         uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
 
-        // rbpVerts[0] = rbpC - rbpX * scale.x - rbpY * scale.y - rbpZ; // (-1, -1, -1)
-        // rbpVerts[1] = rbpC + rbpX * scale.x - rbpY * scale.y - rbpZ; // (+1, -1, -1)
-        // rbpVerts[2] = rbpC + rbpX * scale.x + rbpY * scale.y - rbpZ; // (+1, +1, -1)
-        // rbpVerts[3] = rbpC - rbpX * scale.x + rbpY * scale.y - rbpZ; // (-1, +1, -1)
-        // rbpVerts[4] = rbpC - rbpX           - rbpY           + rbpZ; // (-1, -1, +1)
-        // rbpVerts[5] = rbpC + rbpX           - rbpY           + rbpZ; // (+1, -1, +1)
-        // rbpVerts[6] = rbpC + rbpX           + rbpY           + rbpZ; // (+1, +1, +1)
-        // rbpVerts[7] = rbpC - rbpX           + rbpY           + rbpZ; // (-1, +1, +1)
+        // rbpVerts[0] = rbpC - rbpX * scale - rbpY * scale - rbpZ; (-s, -s, -1)
+        // rbpVerts[1] = rbpC + rbpX * scale - rbpY * scale - rbpZ; (+s, -s, -1)
+        // rbpVerts[2] = rbpC - rbpX * scale + rbpY * scale - rbpZ; (-s, +s, -1)
+        // rbpVerts[3] = rbpC + rbpX * scale + rbpY * scale - rbpZ; (+s, +s, -1)
+        // rbpVerts[4] = rbpC - rbpX         - rbpY         + rbpZ; (-1, -1, +1)
+        // rbpVerts[5] = rbpC + rbpX         - rbpY         + rbpZ; (+1, -1, +1)
+        // rbpVerts[6] = rbpC - rbpX         + rbpY         + rbpZ; (-1, +1, +1)
+        // rbpVerts[7] = rbpC + rbpX         + rbpY         + rbpZ; (+1, +1, +1)
 
-        float3 m; // See the comment above
-
-        m.x = (countbits(v % 4) == 1) ? 1 : -1;
-        m.y = ((v & 2) != 0)          ? 1 : -1;
-        m.z = (v >= 4)                ? 1 : -1;
-
-        m.xy *= (v >= 4) ? 1 : scale;
+        float3 m = GenerateVertexOfStandardCube(v);
+        m.xy *= ((v & 4) == 0) ? scale : 1; // X, Y in [-scale, scale]
 
         float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
         // Avoid generating (w = 0).
@@ -543,7 +553,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         }
         else // Outside
         {
-            cullClipFaceMask |= s_FaceMasksOfVerts[v];
+            cullClipFaceMask |= GetFaceMaskOfVertex(v);
         }
 
         gs_HapVertsX[firstVertexOffset + v]          = hapVert.x;
@@ -572,7 +582,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // The light volume is a special type of cuboid - a right frustum.
         // We can exploit this fact by building a light-space projection matrix.
         float4x4 invTranslateToLightSpace      = Translation4x4(-rbpC);
-        float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(Rotation3x3(rbpX, rbpY, rbpZ)));
+        float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(ScaledRotation3x3(rbpX, rbpY, rbpZ)));
         // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.
 
         // This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
@@ -597,20 +607,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         {
             uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
 
-            // rapVertsCS[0] = (-1, -1, 0)
-            // rapVertsCS[1] = (+1, -1, 0)
-            // rapVertsCS[2] = (+1, +1, 0)
-            // rapVertsCS[3] = (-1, +1, 0)
-            // rapVertsCS[4] = (-1, -1, 1)
-            // rapVertsCS[5] = (+1, -1, 1)
-            // rapVertsCS[6] = (+1, +1, 1)
-            // rapVertsCS[7] = (-1, +1, 1)
-
-            float3 rapVertCS; // See the comment above
-
-            rapVertCS.x = (countbits(v % 4) == 1) ? 1 : -1;
-            rapVertCS.y = ((v & 2) != 0)          ? 1 : -1;
-            rapVertCS.z = (v >= 4)                ? 1 :  0;
+            float3 rapVertCS = GenerateVertexOfStandardCube(v);
+            rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1]
 
             float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space
             float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS);            // View to light space

From 1e90134ad9e6104b80d38ccdf8b830ef2ffba440 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 12:54:05 -0700
Subject: [PATCH 13/22] Add wave intrinsic support

---
 .../Lighting/LightLoop/scrbound.compute       | 66 ++++++++++++++-----
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 32c403ce85d..efa35aed440 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -28,7 +28,6 @@ RWStructuredBuffer<float4> g_vBoundsBuffer : register( u0 );
 
 #define Z_BINNING
 #define DUMB_COMPILER
-// #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported
 
 #ifdef Z_BINNING
 
@@ -170,9 +169,10 @@ float3 GenerateVertexOfStandardCube(uint v)
 
 uint GetVertexListOfFace(uint f)
 {
-    static const uint3 allVertLists = uint3((VERT_LIST_RIGHT  << 12) | VERT_LIST_LEFT,
-                                            (VERT_LIST_BOTTOM << 12) | VERT_LIST_TOP,
-                                            (VERT_LIST_BACK   << 12) | VERT_LIST_FRONT);
+    // Warning: don't add 'static' here unless you want really bad code gen.
+    const uint3 allVertLists = uint3((VERT_LIST_RIGHT  << 12) | VERT_LIST_LEFT,
+                                     (VERT_LIST_BOTTOM << 12) | VERT_LIST_TOP,
+                                     (VERT_LIST_BACK   << 12) | VERT_LIST_FRONT);
 
     return BitFieldExtract(allVertLists[f >> 1], 12 * (f & 1), 12);
 }
@@ -184,7 +184,7 @@ groupshared float gs_HapVertsZ[VERTS_PER_GROUP];
 groupshared float gs_HapVertsW[VERTS_PER_GROUP];
 groupshared uint  gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types)
 
-#ifndef USE_WAVE_INTRINSICS
+#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
 // 1 array *  16 elements * 4 bytes each = 64 bytes.
 groupshared uint  gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces  each (HLSL does not support small data types)
 
@@ -199,7 +199,7 @@ groupshared uint  gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z can
 groupshared uint  gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique.
 groupshared uint  gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
 groupshared uint  gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
-#endif // USE_WAVE_INTRINSICS
+#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
 
 // Returns 'true' if it manages to cull the face.
 bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
@@ -322,7 +322,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
 
     for (int j = 0; j < 4; j++)
     {
-        uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3);
+        uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
         // Non-zero if ANY of the vertices are behind any of the planes.
         clipMaskOfFace |= behindMasksOfVerts[v];
 
@@ -467,7 +467,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     const float3 rbpY  = lgtDat.boxAxisY.xyz; // Pre-scaled
     const float3 rbpZ  = lgtDat.boxAxisZ.xyz; // Pre-scaled
 
-#ifndef USE_WAVE_INTRINSICS
+#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
     // (0) Initialize the TGSM.
     if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
     {
@@ -481,7 +481,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         gs_NdcAaBbMinPtW[groupLocalLightIndex]     = asuint(FLT_INF);
         gs_NdcAaBbMaxPtW[groupLocalLightIndex]     = asuint(0.0f);
     }
-#endif // USE_WAVE_INTRINSICS
+#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
 
     float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF);
     float4 ndcAaBbMaxPt = 0;
@@ -563,8 +563,15 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         gs_BehindMasksOfVerts[firstVertexOffset + v] = behindMask;
     }
 
-#ifdef USE_WAVE_INTRINSICS
-    // ...
+#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
+    for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
+    {
+        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
+        uint orMask  = 0;                       // Plays no role
+        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
+        // TODO: Francesco - expose the right intrinsic.
+        cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask);
+    }
 #else
     InterlockedOr(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
 
@@ -633,6 +640,10 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         }
     }
 
+#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
     uint behindMasksOfVerts[NUM_VERTS];
 
     for (uint i = 0; i < NUM_VERTS; i++)
@@ -661,8 +672,15 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         }
     }
 
-#ifdef USE_WAVE_INTRINSICS
-    // ...
+#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
+    for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
+    {
+        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
+        uint orMask  = 0;                       // Plays no role
+        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
+        // TODO: Francesco - expose the right intrinsic.
+        cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask);
+    }
 #else
     InterlockedAnd(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
 
@@ -694,11 +712,25 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         }
     }
 
-#ifdef USE_WAVE_INTRINSICS
-    // ...
+#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
+    for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
+    {
+        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
+        uint orMask  = 0;                       // Plays no role
+        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
+        // TODO: Francesco - expose the right intrinsic.
+        ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, orMask, 0, xorMask));
+        ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, orMask, 0, xorMask));
+        ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, orMask, 0, xorMask));
+        ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, orMask, 0, xorMask));
+        ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, orMask, 0, xorMask));
+        ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, orMask, 0, xorMask));
+        ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, orMask, 0, xorMask));
+        ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, orMask, 0, xorMask));
+    }
 #else
     // Integer comparison works for floating-point numbers as long as the sign bit is 0.
-    // We must take care of -0 ourselves. saturate() does not help here.
+    // We must take care of -0 ourselves. saturate() does not help.
     InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
     InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
     InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
@@ -718,7 +750,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[groupLocalLightIndex]);
     ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[groupLocalLightIndex]);
     ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[groupLocalLightIndex]);
-#endif // USE_WAVE_INTRINSICS
+#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
 
     if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
     {

From 9e4b8c635c4c5205cac84951aa13c1bce8321edf Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 16:07:59 -0700
Subject: [PATCH 14/22] Fix group count

---
 .../Runtime/Lighting/LightLoop/LightLoop.cs                | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
index 3d6fc7dc90c..9ae90873a5a 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
@@ -2786,7 +2786,12 @@ static void GenerateLightsScreenSpaceAABBs(in BuildGPULightListParameters parame
                 cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mProjectionArr, parameters.lightListProjHMatrices);
                 cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mInvProjectionArr, parameters.lightListInvProjHMatrices);
 
-                cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, (parameters.totalLightCount + 7) / 8, parameters.viewCount, 1);
+                const int threadsPerLight = 4;  // Shader: THREADS_PER_LIGHT (4)
+                const int threadsPerGroup = 64; // Shader: THREADS_PER_GROUP (64)
+
+                int groupCount = HDUtils.DivRoundUp(parameters.totalLightCount * threadsPerLight, threadsPerGroup);
+
+                cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, groupCount, parameters.viewCount, 1);
             }
         }
 

From f52d29ba941cb70226606f189b0867858958a7ec Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 16:14:11 -0700
Subject: [PATCH 15/22] Reduce the kernel count to 1

---
 .../Runtime/Lighting/LightLoop/LightLoop.cs               | 7 +------
 .../Runtime/Lighting/LightLoop/scrbound.compute           | 8 +++-----
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
index 9ae90873a5a..a476e3b789f 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
@@ -564,8 +564,6 @@ public void Allocate()
         Shader deferredTilePixelShader { get { return defaultResources.shaders.deferredTilePS; } }
 
 
-        static int s_GenAABBKernel;
-        static int s_GenAABBKernel_Oblique;
         static int s_GenListPerTileKernel;
         static int s_GenListPerTileKernel_Oblique;
         static int s_GenListPerVoxelKernel;
@@ -782,9 +780,6 @@ void InitializeLightLoop(IBLFilterBSDF[] iBLFilterBSDFArray)
             m_MaxLightsOnScreen = m_MaxDirectionalLightsOnScreen + m_MaxPunctualLightsOnScreen + m_MaxAreaLightsOnScreen + m_MaxEnvLightsOnScreen;
             m_MaxPlanarReflectionOnScreen = lightLoopSettings.maxPlanarReflectionOnScreen;
 
-            s_GenAABBKernel = buildScreenAABBShader.FindKernel("ScreenBoundsAABB");
-            s_GenAABBKernel_Oblique = buildScreenAABBShader.FindKernel("ScreenBoundsAABB_Oblique");
-
             // Cluster
             {
                 s_ClearVoxelAtomicKernel = buildPerVoxelLightListShader.FindKernel("ClearAtomic");
@@ -3075,7 +3070,7 @@ BuildGPULightListParameters PrepareBuildGPULightListParameters(HDCamera hdCamera
 
             // Screen space AABB
             parameters.screenSpaceAABBShader = buildScreenAABBShader;
-            parameters.screenSpaceAABBKernel = isProjectionOblique ? s_GenAABBKernel_Oblique : s_GenAABBKernel;
+            parameters.screenSpaceAABBKernel = 0;
             // camera to screen matrix (and it's inverse)
             for (int viewIndex = 0; viewIndex < hdCamera.viewCount; ++viewIndex)
             {
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index efa35aed440..6f697c6cb4e 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -1,10 +1,6 @@
 // The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
 // https://github.com/wolfgangfengel/GPU-Pro-7
 
-#pragma kernel ScreenBoundsAABB                   SCRAABBGEN=ScreenBoundsAABB
-#pragma kernel ScreenBoundsAABB_Oblique           SCRAABBGEN=ScreenBoundsAABB_Oblique           USE_OBLIQUE_MODE
-
-
 #include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
@@ -13,6 +9,8 @@
 // #pragma enable_d3d11_debug_symbols
 #pragma only_renderers d3d11 playstation xboxone vulkan metal switch
 
+#pragma kernel GenLightAABB
+
 uniform int g_isOrthographic;
 uniform int g_iNrVisibLights;
 
@@ -408,7 +406,7 @@ void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, ou
 #endif // Z_BINNING
 
 [numthreads(NR_THREADS, 1, 1)]
-void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
+void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 {
     uint groupID = u3GroupID.x;
     uint eyeIndex = u3GroupID.y; // currently, can only be 0 or 1

From 64f50ba7319eb9ad91a084ecb49376ff68acb010 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 16:49:23 -0700
Subject: [PATCH 16/22] Remove old code

---
 .../Lighting/LightLoop/scrbound.compute       | 701 +++---------------
 1 file changed, 96 insertions(+), 605 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 6f697c6cb4e..86144d43973 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -11,23 +11,21 @@
 
 #pragma kernel GenLightAABB
 
+/* ------------------------------ Inputs ------------------------------------ */
+
 uniform int g_isOrthographic;
 uniform int g_iNrVisibLights;
 
 uniform float4x4 g_mInvProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS];
 uniform float4x4 g_mProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS];
 
-StructuredBuffer<SFiniteLightBound> g_data : register( t0 );
-
-#define NR_THREADS          64
+StructuredBuffer<SFiniteLightBound> g_data : register(t0);
 
-// output buffer
-RWStructuredBuffer<float4> g_vBoundsBuffer : register( u0 );
+/* ------------------------------ Outputs ----------------------------------- */
 
-#define Z_BINNING
-#define DUMB_COMPILER
+RWStructuredBuffer<float4> g_vBoundsBuffer : register(u0);
 
-#ifdef Z_BINNING
+/* ------------------------------ Utilities --------------------------------- */
 
 // Returns the location of the N-th set bit starting from the lowest order bit and working upward.
 // Slow implementation - do not use for large bit sets.
@@ -101,6 +99,10 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
                       0, 0, 1, 0);
 }
 
+/* ------------------------------ Implementation ---------------------------- */
+
+#define DUMB_COMPILER // Improve the quality of generated code
+
 #define CLEAR_SIGN_BIT(X)  (asint(X) & INT_MAX)
 #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks
 
@@ -349,7 +351,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
 }
 
 void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERTS],
-                bool isOrthoProj, float4x4 invProj,
+                bool isOrthoProj, float4x4 invProjMat,
                 inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
 {
 #ifdef DUMB_COMPILER
@@ -367,7 +369,7 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT
 
         if (isOrthoProj) // Must replace (w = 1)
         {
-            rbpVertVSz = dot(invProj[2], hapVert);
+            rbpVertVSz = dot(invProjMat[2], hapVert);
         }
 
         ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
@@ -379,105 +381,70 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT
     }
 }
 
-#else // !Z_BINNING
-
-#define THREADS_PER_LIGHT (8)
-#define THREADS_PER_GROUP (64)
-#define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
-#define MAX_PNTS          (9)   // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed)
-                                // However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane
-                                // clipping gets skipped which doesn't cause any errors.
-
-
-// LDS (2496 bytes)
-groupshared float posX[MAX_PNTS*8*2];
-groupshared float posY[MAX_PNTS*8*2];
-groupshared float posZ[MAX_PNTS*8*2];
-groupshared float posW[MAX_PNTS*8*2];
-groupshared unsigned int clipFlags[48];
-
+//**********************************************************************************************
+// The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
+// The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
+//
+// Since a light volume may be partially off-screen, we must clip it before computing the AABB.
+// Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
+//
+// To avoid having to deal with the "Moebius twist" property of the perspective transform,
+// we perform clipping using the homogeneous (projective) post-perspective coordinates.
+// This clipping method is described in Blinn's paper titled "Line Clipping".
+//
+// The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
+// worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
+// Some faces may require culling rather than clipping (the former is simpler).
+//
+// It's important to realize that face culling may end up culling 5 (or even all 6) faces.
+// This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
+// (Imagine a view volume completely or partially inside a light volume).
+// Therefore, we must perform view-volume-corner-inside-light-volume tests.
+//
+//
+// Notation:
+// rbp - real (3D) coordinates before perspective
+// hbp - hom. (4D) coordinates before perspective
+// hap - hom. (4D) coordinates after  perspective
+// rap - real (3D) coordinates after  perspective (after division by w)
+// *********************************************************************************************
+
+[numthreads(THREADS_PER_GROUP, 1, 1)]
+void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
+{
+    const uint t        = threadID;
+    const uint g        = groupID.x;
+    const uint eyeIndex = groupID.y; // Currently, can only be 0 or 1
 
-unsigned int GetClip(const float4 P);
-int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p);
-void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r);
+    const uint intraGroupLightIndex = t / THREADS_PER_LIGHT;
+    const uint globalLightIndex     = g * LIGHTS_PER_GROUP + intraGroupLightIndex;
+    const uint firstVertexOffset    = intraGroupLightIndex * NUM_VERTS;
 
-#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl"
+    const int eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
+    const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset];
 
-#endif // Z_BINNING
+    const float4x4 projMat    = g_mProjectionArr[eyeIndex];
+    const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex];
 
-[numthreads(NR_THREADS, 1, 1)]
-void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
-{
-    uint groupID = u3GroupID.x;
-    uint eyeIndex = u3GroupID.y; // currently, can only be 0 or 1
-
-    // The g_ is preserved in order to make cross-pipeline (FPTL) updates easier
-    float4x4 g_mInvProjection = g_mInvProjectionArr[eyeIndex];
-    float4x4 g_mProjection = g_mProjectionArr[eyeIndex];
-
-    //uint vindex = groupID * NR_THREADS + threadID;
-    unsigned int g = groupID;
-    unsigned int t = threadID;
-
-    const int subLigt = (uint) (t/THREADS_PER_LIGHT);
-    const int lgtIndex = subLigt+(uint) g*LIGHTS_PER_GROUP;
-    const int sideIndex = (uint) (t%8);
-
-    const int eyeAdjustedLgtIndex = GenerateLightCullDataIndex(lgtIndex, g_iNrVisibLights, eyeIndex);
-    SFiniteLightBound lgtDat = g_data[eyeAdjustedLgtIndex];
-
-#ifdef Z_BINNING
-    //**********************************************************************************************
-    // The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
-    // The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
-    //
-    // Since a light volume may be partially off-screen, we must clip it before computing the AABB.
-    // Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
-    //
-    // To avoid having to deal with the "Moebius twist" property of the perspective transform,
-    // we perform clipping using the homogeneous (projective) post-perspective coordinates.
-    // This clipping method in described in Blinn's paper titled "Line Clipping".
-    //
-    // The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
-    // worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
-    // Some faces may require culling rather than clipping (the former is simpler).
-    //
-    // It's important to realize that face culling may end up culling 5 (or even all 6) faces.
-    // This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
-    // (Imagine a view volume completely or partially inside a light volume).
-    // Therefore, we must perform view-volume-corner-inside-light-volume tests.
-    //
-    //
-    // Notation:
-    // rbp - real (3D) coordinates before perspective
-    // hbp - hom. (4D) coordinates before perspective
-    // hap - hom. (4D) coordinates after  perspective
-    // rap - real (3D) coordinates after  perspective (after division by w)
-    // *********************************************************************************************
-
-    const uint groupLocalLightIndex = t / THREADS_PER_LIGHT;
-    const uint firstVertexOffset    = NUM_VERTS * groupLocalLightIndex;
-
-    const float  scale = lgtDat.scaleXY;      // scale.x = scale.y
-    const float3 rbpC  = lgtDat.center.xyz;
-    // TODO: store X, Y, Scale
-    const float3 rbpX  = lgtDat.boxAxisX.xyz; // Pre-scaled
-    const float3 rbpY  = lgtDat.boxAxisY.xyz; // Pre-scaled
-    const float3 rbpZ  = lgtDat.boxAxisZ.xyz; // Pre-scaled
+    const float  scale = cullData.scaleXY;      // scale.x = scale.y
+    const float3 rbpC  = cullData.center.xyz;   // View-space
+    const float3 rbpX  = cullData.boxAxisX.xyz; // Pre-scaled
+    const float3 rbpY  = cullData.boxAxisY.xyz; // Pre-scaled
+    const float3 rbpZ  = cullData.boxAxisZ.xyz; // Pre-scaled
 
 #ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
     // (0) Initialize the TGSM.
     if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
     {
-        gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside
-        gs_NdcAaBbMinPtX[groupLocalLightIndex]     = asuint(1.0f);
-        gs_NdcAaBbMaxPtX[groupLocalLightIndex]     = asuint(0.0f);
-        gs_NdcAaBbMinPtY[groupLocalLightIndex]     = asuint(1.0f);
-        gs_NdcAaBbMaxPtY[groupLocalLightIndex]     = asuint(0.0f);
-        gs_NdcAaBbMinPtZ[groupLocalLightIndex]     = asuint(1.0f);
-        gs_NdcAaBbMaxPtZ[groupLocalLightIndex]     = asuint(0.0f);
-        gs_NdcAaBbMinPtW[groupLocalLightIndex]     = asuint(FLT_INF);
-        gs_NdcAaBbMaxPtW[groupLocalLightIndex]     = asuint(0.0f);
+        gs_CullClipFaceMasks[intraGroupLightIndex] = 0; // Initially inside
+        gs_NdcAaBbMinPtX[intraGroupLightIndex]     = asuint(1.0f);
+        gs_NdcAaBbMaxPtX[intraGroupLightIndex]     = asuint(0.0f);
+        gs_NdcAaBbMinPtY[intraGroupLightIndex]     = asuint(1.0f);
+        gs_NdcAaBbMaxPtY[intraGroupLightIndex]     = asuint(0.0f);
+        gs_NdcAaBbMinPtZ[intraGroupLightIndex]     = asuint(1.0f);
+        gs_NdcAaBbMaxPtZ[intraGroupLightIndex]     = asuint(0.0f);
+        gs_NdcAaBbMinPtW[intraGroupLightIndex]     = asuint(FLT_INF);
+        gs_NdcAaBbMaxPtW[intraGroupLightIndex]     = asuint(0.0f);
     }
 #endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
 
@@ -511,7 +478,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         // Avoid generating (w = 0).
         rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN;
 
-        float4 hapVert = mul(g_mProjection, float4(rbpVertVS, 1));
+        float4 hapVert = mul(projMat, float4(rbpVertVS, 1));
 
         // Warning: the W component may be negative.
         // Flipping the -W pyramid by negating all coordinates is incorrect
@@ -571,11 +538,11 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask);
     }
 #else
-    InterlockedOr(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
+    InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
 
     GroupMemoryBarrierWithGroupSync();
 
-    cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
+    cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
 #endif
 
     // (2) Test the corners of the view volume.
@@ -615,8 +582,8 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             float3 rapVertCS = GenerateVertexOfStandardCube(v);
             rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1]
 
-            float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space
-            float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS);            // View to light space
+            float4 hbpVertVS = mul(invProjMat, float4(rapVertCS, 1)); // Clip to view space
+            float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS);      // View to light space
 
             // Consider the vertex to be inside the light volume if:
             // -w < x < w
@@ -680,11 +647,11 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
         cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask);
     }
 #else
-    InterlockedAnd(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
+    InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
 
     GroupMemoryBarrierWithGroupSync();
 
-    cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
+    cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
 #endif
 
     // (4) Clip the faces.
@@ -704,7 +671,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
                 float4 vertRingBuffer[MAX_CLIP_VERTS];
                 ClipFaceAgainstViewVolume(f, behindMasksOfVerts, firstVertexOffset,
                                           srcBegin, srcSize, vertRingBuffer);
-                UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, g_mInvProjection,
+                UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, invProjMat,
                            ndcAaBbMinPt, ndcAaBbMaxPt);
             }
         }
@@ -729,510 +696,34 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 #else
     // Integer comparison works for floating-point numbers as long as the sign bit is 0.
     // We must take care of -0 ourselves. saturate() does not help.
-    InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
-    InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
-    InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
-    InterlockedMax(gs_NdcAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y)));
-    InterlockedMin(gs_NdcAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z)));
-    InterlockedMax(gs_NdcAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z)));
-    InterlockedMin(gs_NdcAaBbMinPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w)));
-    InterlockedMax(gs_NdcAaBbMaxPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w)));
+    InterlockedMin(gs_NdcAaBbMinPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
+    InterlockedMax(gs_NdcAaBbMaxPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
+    InterlockedMin(gs_NdcAaBbMinPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
+    InterlockedMax(gs_NdcAaBbMaxPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y)));
+    InterlockedMin(gs_NdcAaBbMinPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z)));
+    InterlockedMax(gs_NdcAaBbMaxPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z)));
+    InterlockedMin(gs_NdcAaBbMinPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w)));
+    InterlockedMax(gs_NdcAaBbMaxPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w)));
 
     GroupMemoryBarrierWithGroupSync();
 
-    ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[groupLocalLightIndex]);
-    ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[groupLocalLightIndex]);
-    ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[groupLocalLightIndex]);
-    ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[groupLocalLightIndex]);
-    ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[groupLocalLightIndex]);
-    ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[groupLocalLightIndex]);
-    ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[groupLocalLightIndex]);
-    ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[groupLocalLightIndex]);
+    ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[intraGroupLightIndex]);
+    ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[intraGroupLightIndex]);
+    ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[intraGroupLightIndex]);
+    ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[intraGroupLightIndex]);
+    ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[intraGroupLightIndex]);
+    ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[intraGroupLightIndex]);
+    ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[intraGroupLightIndex]);
+    ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]);
 #endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
 
     if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
     {
-        // Each light's AABB is represented by two float3s, the min and max of the box.
-        // And for stereo, we have two sets of lights. Therefore, each eye has a set of mins, followed by
-        // a set of maxs, and each set is equal to g_iNrVisibLights.
-        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex);
-
-        g_vBoundsBuffer[boundsIndices.min] = ndcAaBbMinPt;
-        g_vBoundsBuffer[boundsIndices.max] = ndcAaBbMaxPt;
-    }
-
-#else // !Z_BINNING
-    const float3 boxX = lgtDat.boxAxisX.xyz;
-    const float3 boxY = lgtDat.boxAxisY.xyz;
-    const float3 boxZ = -lgtDat.boxAxisZ.xyz;           // flip axis (so it points away from the light direction for a spot-light)
-    const float3 center = lgtDat.center.xyz;
-    const float radius = lgtDat.radius;
-    const float2 scaleXY = lgtDat.scaleXY;
-
-    {
-        if(sideIndex<6 && lgtIndex<(int) g_iNrVisibLights)      // mask 2 out of 8 threads
-        {
-            float3 q0, q1, q2, q3;
-            GetHullQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, sideIndex);
-
-
-            const float4 vP0 = mul(g_mProjection, float4(q0, 1));
-            const float4 vP1 = mul(g_mProjection, float4(q1, 1));
-            const float4 vP2 = mul(g_mProjection, float4(q2, 1));
-            const float4 vP3 = mul(g_mProjection, float4(q3, 1));
-
-            // test vertices of one quad (of the convex hull) for intersection
-            const unsigned int uFlag0 = GetClip(vP0);
-            const unsigned int uFlag1 = GetClip(vP1);
-            const unsigned int uFlag2 = GetClip(vP2);
-            const unsigned int uFlag3 = GetClip(vP3);
-
-            const float4 vPnts[] = {vP0, vP1, vP2, vP3};
-
-            // screen-space AABB of one quad (assuming no intersection)
-            float3 vMin, vMax;
-            for(int k=0; k<4; k++)
-            {
-                float fW = vPnts[k].w;
-                float fS = fW<0 ? -1 : 1;
-                float fWabs = fW<0 ? (-fW) : fW;
-                fW = fS * (fWabs<FLT_EPS ? FLT_EPS : fWabs);
-                float3 vP = float3(vPnts[k].x/fW, vPnts[k].y/fW, vPnts[k].z/fW);
-                if(k==0) { vMin=vP; vMax=vP; }
-
-                vMax = max(vMax, vP); vMin = min(vMin, vP);
-            }
-
-            clipFlags[subLigt*6+sideIndex] = (uFlag0<<0) | (uFlag1<<6) | (uFlag2<<12) | (uFlag3<<18);
-
-            // store in clip buffer (only use these vMin and vMax if light is 100% visible in which case clipping isn't needed)
-            posX[subLigt*MAX_PNTS*2 + sideIndex] = vMin.x;
-            posY[subLigt*MAX_PNTS*2 + sideIndex] = vMin.y;
-            posZ[subLigt*MAX_PNTS*2 + sideIndex] = vMin.z;
-
-            posX[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.x;
-            posY[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.y;
-            posZ[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.z;
-        }
-    }
-
-    // if not XBONE and not PLAYSTATION4 we need a memorybarrier here
-    // since we can't rely on the gpu cores being 64 wide.
-    // We need a pound define around this.
-    GroupMemoryBarrierWithGroupSync();
-
-
-    {
-        int f=0;
-
-        if(sideIndex==0 && lgtIndex<(int) g_iNrVisibLights)
-        {
-            // quick acceptance or rejection
-            unsigned int uCollectiveAnd = (unsigned int) -1;
-            unsigned int uCollectiveOr = 0;
-            for(f=0; f<6; f++)
-            {
-                unsigned int uFlagAnd = clipFlags[subLigt*6+f]&0x3f;
-                unsigned int uFlagOr = uFlagAnd;
-                for(int i=1; i<4; i++)
-                {
-                    unsigned int uClipBits = (clipFlags[subLigt*6+f]>>(i*6))&0x3f;
-                    uFlagAnd &= uClipBits;
-                    uFlagOr |= uClipBits;
-                }
-
-                uCollectiveAnd &= uFlagAnd;
-                uCollectiveOr |= uFlagOr;
-            }
-
-            bool bSetBoundYet = false;
-            float3 vMin=0.0, vMax=0.0;
-            if(uCollectiveAnd!=0 || uCollectiveOr==0)       // all invisible or all visible (early out)
-            {
-                if(uCollectiveOr==0)    // all visible
-                {
-                    for(f=0; f<6; f++)
-                    {
-                        const int sideIndex = f;
-
-                        float3 vFaceMi = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 0], posY[subLigt*MAX_PNTS*2 + sideIndex + 0], posZ[subLigt*MAX_PNTS*2 + sideIndex + 0]);
-                        float3 vFaceMa = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 6], posY[subLigt*MAX_PNTS*2 + sideIndex + 6], posZ[subLigt*MAX_PNTS*2 + sideIndex + 6]);
-
-                        for(int k=0; k<2; k++)
-                        {
-                            float3 vP = k==0 ? vFaceMi : vFaceMa;
-                            if(f==0 && k==0) { vMin=vP; vMax=vP; }
-
-                            vMax = max(vMax, vP); vMin = min(vMin, vP);
-                        }
-                    }
-                    bSetBoundYet=true;
-                }
-            }
-            else        // :( need true clipping
-            {
-
-                for(f=0; f<6; f++)
-                {
-                    float3 q0, q1, q2, q3;
-                    GetHullQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, f);
-
-                    // 4 vertices to a quad of the convex hull in post projection space
-                    const float4 vP0 = mul(g_mProjection, float4(q0, 1));
-                    const float4 vP1 = mul(g_mProjection, float4(q1, 1));
-                    const float4 vP2 = mul(g_mProjection, float4(q2, 1));
-                    const float4 vP3 = mul(g_mProjection, float4(q3, 1));
-
-
-                    int iSrcIndex = 0;
-
-                    int offs = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
-
-                    // fill up source clip buffer with the quad
-                    posX[offs+0]=vP0.x; posX[offs+1]=vP1.x; posX[offs+2]=vP2.x; posX[offs+3]=vP3.x;
-                    posY[offs+0]=vP0.y; posY[offs+1]=vP1.y; posY[offs+2]=vP2.y; posY[offs+3]=vP3.y;
-                    posZ[offs+0]=vP0.z; posZ[offs+1]=vP1.z; posZ[offs+2]=vP2.z; posZ[offs+3]=vP3.z;
-                    posW[offs+0]=vP0.w; posW[offs+1]=vP1.w; posW[offs+2]=vP2.w; posW[offs+3]=vP3.w;
-
-                    int iNrSrcVerts = 4;
-
-                    // do true clipping
-                    for(int p=0; p<6; p++)
-                    {
-                        const int nrVertsDst = ClipAgainstPlane(iSrcIndex, iNrSrcVerts, subLigt, p);
-
-                        iSrcIndex = 1-iSrcIndex;
-                        iNrSrcVerts = nrVertsDst;
-
-                        if(iNrSrcVerts<3 || iNrSrcVerts>=MAX_PNTS) break;
-                    }
-
-                    // final clipped convex primitive is in src buffer
-                    if(iNrSrcVerts>2)
-                    {
-                        int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
-                        for(int k=0; k<iNrSrcVerts; k++)
-                        {
-                            float4 vCur = float4(posX[offs_src+k], posY[offs_src+k], posZ[offs_src+k], posW[offs_src+k]);
-
-                            // project and apply toward AABB
-                            float3 vP = float3(vCur.x/vCur.w, vCur.y/vCur.w, vCur.z/vCur.w);
-                            if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }
-
-                            vMax = max(vMax, vP); vMin = min(vMin, vP);
-                        }
-                    }
-
-                }
-
-                ////////////////////// look for camera frustum verts that need to be included. That is frustum vertices inside the convex hull for the light
-#ifdef USE_OBLIQUE_MODE
-				bool bIsObliqueClipPlane = true;
-#else
-				bool bIsObliqueClipPlane = false;
-#endif
-				const int nrFrustVertsToTest = bIsObliqueClipPlane ? 4 : 8;
-
-                int i=0;
-                for(i=0; i<nrFrustVertsToTest; i++)  // establish 8 camera frustum vertices
-                {
-                    float3 vVertPSpace = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);
-
-                    float4 v4ViewSpace = mul(g_mInvProjection, float4(vVertPSpace,1));
-                    float3 vViewSpace = float3(v4ViewSpace.x/v4ViewSpace.w, v4ViewSpace.y/v4ViewSpace.w, v4ViewSpace.z/v4ViewSpace.w);
-
-                    posX[subLigt*MAX_PNTS*2 + i] = vViewSpace.x;
-                    posY[subLigt*MAX_PNTS*2 + i] = vViewSpace.y;
-                    posZ[subLigt*MAX_PNTS*2 + i] = vViewSpace.z;
-                }
-
-                // determine which camera frustum vertices are inside the convex hull
-                uint uVisibFl = 0xff;
-                for(f=0; f<6; f++)
-                {
-                    float3 vP0, vN;
-                    GetHullPlane(vP0, vN, boxX, boxY, boxZ, center, scaleXY, f);
-
-                    for(i=0; i<nrFrustVertsToTest; i++)
-                    {
-                        float3 vViewSpace = float3(posX[subLigt*MAX_PNTS*2 + i], posY[subLigt*MAX_PNTS*2 + i], posZ[subLigt*MAX_PNTS*2 + i]);
-                        uVisibFl &= ( dot(vViewSpace-vP0, vN)<0 ? 0xff : (~(1<<i)) );
-                    }
-                }
-
-                // apply camera frustum vertices inside the convex hull to the AABB
-                for(i=0; i<nrFrustVertsToTest; i++)
-                {
-                    if((uVisibFl&(1<<i))!=0)
-                    {
-                        float3 vP = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);
-
-                        if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }
-
-                        vMax = max(vMax, vP); vMin = min(vMin, vP);
-                    }
-                }
-            }
-
-
-
-
-
-            // determine AABB bound in [-1;1]x[-1;1] screen space using bounding sphere.
-            // Use the result to make our already established AABB from the convex hull
-            // potentially tighter.
-            if(!bSetBoundYet)
-            {
-                // set the AABB off-screen
-                vMin = float3(-3,-3,-3);
-                vMax = float3(-2,-2,-2);
-            }
-            else
-            {
-                //if((center.z+radius)<0.0)
-                if(g_isOrthographic==0 && length(center)>radius)
-                {
-                    float2 vMi, vMa;
-                    bool2 bMi, bMa;
-                    CalcBound(bMi, bMa, vMi, vMa, g_mInvProjection, center, radius);
-
-                    vMin.xy = bMi ? max(vMin.xy, vMi) : vMin.xy;
-                    vMax.xy = bMa ? min(vMax.xy, vMa) : vMax.xy;
-                }
-                else if(g_isOrthographic!=0)
-                {
-                    float2 vMi = mul(g_mProjection, float4(center.xyz-radius,1)).xy;     // no division needed for ortho
-                    float2 vMa = mul(g_mProjection, float4(center.xyz+radius,1)).xy;     // no division needed for ortho
-                    vMin.xy = max(vMin.xy, vMi);
-                    vMax.xy = min(vMax.xy, vMa);
-                }
-#ifndef USE_OBLIQUE_MODE
-#if USE_LEFT_HAND_CAMERA_SPACE
-                if((center.z-radius)>0.0)
-                {
-                    float4 vPosF = mul(g_mProjection, float4(0,0,center.z-radius,1));
-                    vMin.z = max(vMin.z, vPosF.z/vPosF.w);
-                }
-                if((center.z+radius)>0.0)
-                {
-                    float4 vPosB = mul(g_mProjection, float4(0,0,center.z+radius,1));
-                    vMax.z = min(vMax.z, vPosB.z/vPosB.w);
-                }
-#else
-                if((center.z+radius)<0.0)
-                {
-                    float4 vPosF = mul(g_mProjection, float4(0,0,center.z+radius,1));
-                    vMin.z = max(vMin.z, vPosF.z/vPosF.w);
-                }
-                if((center.z-radius)<0.0)
-                {
-                    float4 vPosB = mul(g_mProjection, float4(0,0,center.z-radius,1));
-                    vMax.z = min(vMax.z, vPosB.z/vPosB.w);
-                }
-#endif
-                else
-                {
-                    vMin = float3(-3,-3,-3);
-                    vMax = float3(-2,-2,-2);
-                }
-#endif
-            }
-
-
-            // we should consider doing a look-up here into a max depth mip chain
-            // to see if the light is occluded: vMin.z*VIEWPORT_SCALE_Z > MipTexelMaxDepth
-            //g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, -0.5*vMax.y+0.5, vMin.z*VIEWPORT_SCALE_Z);
-            //g_vBoundsBuffer[lgtIndex+g_iNrVisibLights] = float3(0.5*vMax.x+0.5, -0.5*vMin.y+0.5, vMax.z*VIEWPORT_SCALE_Z);
-
-            // changed for unity
-
-            // Each light's AABB is represented by two float3s, the min and max of the box.
-            // And for stereo, we have two sets of lights.  Therefore, each eye has a set of mins, followed by
-            // a set of maxs, and each set is equal to g_iNrVisibLights.
-            const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex);
-
-			// build a linear (in camera space) min/max Z for the aabb. This is needed for clustered when oblique is active
-			float linMiZ, linMaZ;
-#ifndef USE_OBLIQUE_MODE
-			float2 vMiZW = mul(g_mInvProjection, float4(vMin,1)).zw;
-			float2 vMaZW = mul(g_mInvProjection, float4(vMax,1)).zw;
-			linMiZ = vMiZW.x/vMiZW.y; linMaZ = vMaZW.x/vMaZW.y;
-#else
-			for(int i=0; i<8; i++)  // establish 8 aabb points in camera space.
-            {
-                float3 vP = float3((i&1)!=0 ? vMax.x : vMin.x, (i&2)!=0 ? vMax.y : vMin.y, (i&4)!=0 ? vMax.z : vMin.z);
-
-                float2 v2Pc = mul(g_mInvProjection, float4(vP,1)).zw;
-                float linZ = v2Pc.x/v2Pc.y;
-
-				if(i==0) { linMiZ=linZ; linMaZ=linZ; }
-#if USE_LEFT_HAND_CAMERA_SPACE
-				linMiZ = min(linMiZ, linZ); linMaZ = max(linMaZ, linZ);
-#else
-				linMiZ = max(linMiZ, linZ); linMaZ = min(linMaZ, linZ);
-#endif
-            }
-
-			float z0 = center.z-radius, z1 = center.z+radius;
-#if USE_LEFT_HAND_CAMERA_SPACE
-			linMiZ = max(linMiZ, z0); linMaZ = min(linMaZ, z1);
-#else
-			linMiZ = min(linMiZ, z1); linMaZ = max(linMaZ, z0);
-#endif
-
-#endif
-
-            g_vBoundsBuffer[boundsIndices.min] = float4(0.5*vMin.x + 0.5, 0.5*vMin.y + 0.5, vMin.z*VIEWPORT_SCALE_Z, linMiZ);
-            g_vBoundsBuffer[boundsIndices.max] = float4(0.5*vMax.x + 0.5, 0.5*vMax.y + 0.5, vMax.z*VIEWPORT_SCALE_Z, linMaZ);
-        }
-    }
-#endif // Z_BINNING
-}
-
-#ifndef Z_BINNING
-
-float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p);
-
-int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p)
-{
-    int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
-    int offs_dst = (1-iSrcIndex)*MAX_PNTS+subLigt*MAX_PNTS*2;
-
-    float4 vPrev = float4(posX[offs_src+(iNrSrcVerts-1)], posY[offs_src+(iNrSrcVerts-1)], posZ[offs_src+(iNrSrcVerts-1)], posW[offs_src+(iNrSrcVerts-1)]);
-
-    int nrVertsDst = 0;
-
-    unsigned int uMask = (1<<p);
-    bool bIsPrevVisib = (GetClip(vPrev)&uMask)==0;
-    for(int i=0; i<iNrSrcVerts; i++)
-    {
-        float4 vCur = float4(posX[offs_src+i], posY[offs_src+i], posZ[offs_src+i], posW[offs_src+i]);
-        bool bIsCurVisib = (GetClip(vCur)&uMask)==0;
-        if( (bIsCurVisib && !bIsPrevVisib) || (!bIsCurVisib && bIsPrevVisib) )
-        {
-            //assert(nrVertsDst<MAX_PNTS);
-            if(nrVertsDst<MAX_PNTS)
-            {
-                // generate new vertex
-                float4 vNew = GenNewVert(bIsCurVisib ? vCur : vPrev, bIsCurVisib ? vPrev : vCur, p);
-                posX[offs_dst+nrVertsDst]=vNew.x; posY[offs_dst+nrVertsDst]=vNew.y; posZ[offs_dst+nrVertsDst]=vNew.z; posW[offs_dst+nrVertsDst]=vNew.w;
-                ++nrVertsDst;
-            }
-        }
-
-        if(bIsCurVisib)
-        {
-            //assert(nrVertsDst<MAX_PNTS);
-            if(nrVertsDst<MAX_PNTS)
-            {
-                posX[offs_dst+nrVertsDst]=vCur.x; posY[offs_dst+nrVertsDst]=vCur.y; posZ[offs_dst+nrVertsDst]=vCur.z; posW[offs_dst+nrVertsDst]=vCur.w;
-                ++nrVertsDst;
-            }
-        }
+        // For stereo, we have two sets of lights. Therefore, each eye has a set of mins
+        // followed by a set of maxs, and each set is equal to g_iNrVisibLights.
+        const ScreenSpaceBoundsIndices eyeAdjustedOutputOffsets = GenerateScreenSpaceBoundsIndices(globalLightIndex, g_iNrVisibLights, eyeIndex);
 
-        vPrev = vCur;
-        bIsPrevVisib = bIsCurVisib;
+        g_vBoundsBuffer[eyeAdjustedOutputOffsets.min] = ndcAaBbMinPt;
+        g_vBoundsBuffer[eyeAdjustedOutputOffsets.max] = ndcAaBbMaxPt;
     }
-
-    return nrVertsDst;
-}
-
-
-
-unsigned int GetClip(const float4 P)
-{
-#ifdef USE_OBLIQUE_MODE
-	bool bIsObliqueClipPlane = true;
-#else
-	bool bIsObliqueClipPlane = false;
-#endif
-
-    //-P.w <= P.x <= P.w
-    return (((P.x<-P.w)?1:0) | ((P.x>P.w)?2:0) | ((P.y<-P.w)?4:0) | ((P.y>P.w)?8:0) | ((P.z<0)?16:0) | ((P.z>P.w)?32:0)) & (bIsObliqueClipPlane ? 0x1f : 0x3f);
-}
-
-float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p)
-{
-    const float fS = p==4 ? 0 : ((p&1)==0 ? -1 : 1);
-    const int index = ((uint) p)/2;
-    float x1 = index==0 ? vVisib.x : (index==1 ? vVisib.y : vVisib.z);
-    float x0 = index==0 ? vInvisib.x : (index==1 ? vInvisib.y : vInvisib.z);
-
-    //fS*((vVisib.w-vInvisib.w)*t + vInvisib.w) = (x1-x0)*t + x0;
-
-    const float fT = (fS*vInvisib.w-x0)/((x1-x0) - fS*(vVisib.w-vInvisib.w));
-    float4 vNew = vVisib*fT + vInvisib*(1-fT);
-
-    // just to be really anal we make sure the clipped against coordinate is precise
-    if(index==0) vNew.x = fS*vNew.w;
-    else if(index==1) vNew.y = fS*vNew.w;
-    else vNew.z = fS*vNew.w;
-
-    return vNew;
-}
-
-
-float4 TransformPlaneToPostSpace(float4x4 InvProjection, float4 plane)
-{
-    return mul(plane, InvProjection);
-}
-
-float4 EvalPlanePair(out bool validPlanes, float2 posXY_in, float r)
-{
-    // rotate by 90 degrees to avoid potential division by zero
-    bool bMustFlip = abs(posXY_in.y)<abs(posXY_in.x);
-    float2 posXY = bMustFlip ? float2(-posXY_in.y, posXY_in.x) : posXY_in;
-
-    float fLenSQ = dot(posXY, posXY);
-    float diffSq = fLenSQ - r*r;
-    float D = posXY.y * sqrt(max(0.0, diffSq));
-
-    float4 res;
-    res.x = (-r*posXY.x - D) / fLenSQ;
-    res.z = (-r*posXY.x + D) / fLenSQ;
-    res.y = (-r-res.x*posXY.x) / posXY.y;
-    res.w = (-r-res.z*posXY.x) / posXY.y;
-
-    // rotate back by 90 degrees
-    res = bMustFlip ? float4(res.y, -res.x, res.w, -res.z) : res;
-
-    validPlanes = diffSq>0.0;
-
-    return res;
 }
-
-void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r)
-{
-    bool validX, validY;
-    float4 planeX = EvalPlanePair(validX, float2(pos_view_space.x, pos_view_space.z), r);
-    float4 planeY = EvalPlanePair(validY, float2(pos_view_space.y, pos_view_space.z), r);
-
-
-#if USE_LEFT_HAND_CAMERA_SPACE
-    planeX = planeX.zwxy;       // need to swap left/right and top/bottom planes when using left hand system
-    planeY = planeY.zwxy;
-#endif
-
-    bIsMinValid = bool2(planeX.z<0, planeY.z<0) && bool2(validX,validY);
-    bIsMaxValid = bool2((-planeX.x)<0, (-planeY.x)<0) && bool2(validX,validY);
-
-    // hopefully the compiler takes zeros into account
-    // should be the case since the transformation in TransformPlaneToPostSpace()
-    // is done using multiply-adds and not dot product instructions.
-    float4 planeX0 = TransformPlaneToPostSpace(InvProjection, float4(planeX.x, 0, planeX.y, 0));
-    float4 planeX1 = TransformPlaneToPostSpace(InvProjection, float4(planeX.z, 0, planeX.w, 0));
-    float4 planeY0 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.x, planeY.y, 0));
-    float4 planeY1 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.z, planeY.w, 0));
-
-
-    // convert planes to the forms (1,0,0,D) and (0,1,0,D)
-    // 2D bound is given by -D components
-    float2 A = -float2(planeX0.w / planeX0.x, planeY0.w / planeY0.y);
-    float2 B = -float2(planeX1.w / planeX1.x, planeY1.w / planeY1.y);
-
-    // Bound is complete
-    vMin = B;
-    vMax = A;
-}
-
-#endif // !Z_BINNING
\ No newline at end of file

From 1a3e1725aea8e2e913c6509ff3804593e4b414fd Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 17:25:46 -0700
Subject: [PATCH 17/22] Bounds check

---
 .../Lighting/LightLoop/LightCullUtils.hlsl    | 16 ++++----
 .../Lighting/LightLoop/scrbound.compute       | 40 +++++++++----------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl
index ea8d937ca7c..4a2a69df125 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl
@@ -3,31 +3,33 @@
 
 // Used to index into our SFiniteLightBound (g_data) and
 // LightVolumeData (_LightVolumeData) buffers.
-int GenerateLightCullDataIndex(int lightIndex, uint numVisibleLights, uint eyeIndex)
+uint GenerateLightCullDataIndex(uint lightIndex, uint numVisibleLights, uint eyeIndex)
 {
+    lightIndex = min(lightIndex, numVisibleLights - 1); // Stay within bounds
+
     // For monoscopic, there is just one set of light cull data structs.
     // In stereo, all of the left eye structs are first, followed by the right eye structs.
-    const int perEyeBaseIndex = (int)eyeIndex * (int)numVisibleLights;
+    const uint perEyeBaseIndex = eyeIndex * numVisibleLights;
     return (perEyeBaseIndex + lightIndex);
 }
 
 struct ScreenSpaceBoundsIndices
 {
-    int min;
-    int max;
+    uint min;
+    uint max;
 };
 
 // The returned values are used to index into our AABB screen space bounding box buffer
 // Usually named g_vBoundsBuffer.  The two values represent the min/max indices.
-ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(int lightIndex, uint numVisibleLights, uint eyeIndex)
+ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(uint lightIndex, uint numVisibleLights, uint eyeIndex)
 {
     // In the monoscopic mode, there is one set of bounds (min,max -> 2 * g_iNrVisibLights)
     // In stereo, there are two sets of bounds (leftMin, leftMax, rightMin, rightMax -> 4 * g_iNrVisibLights)
-    const int eyeRelativeBase = (int)eyeIndex * 2 * (int)numVisibleLights;
+    const uint eyeRelativeBase = eyeIndex * 2 * numVisibleLights;
 
     ScreenSpaceBoundsIndices indices;
     indices.min = eyeRelativeBase + lightIndex;
-    indices.max = eyeRelativeBase + lightIndex + (int)numVisibleLights;
+    indices.max = indices.min + numVisibleLights;
 
     return indices;
 }
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 86144d43973..c804adad4dc 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -13,8 +13,8 @@
 
 /* ------------------------------ Inputs ------------------------------------ */
 
-uniform int g_isOrthographic;
-uniform int g_iNrVisibLights;
+uniform uint g_isOrthographic;
+uniform uint g_iNrVisibLights;
 
 uniform float4x4 g_mInvProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS];
 uniform float4x4 g_mProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS];
@@ -207,7 +207,7 @@ bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
     uint cullMaskOfFace = FACE_MASK; // Initially behind
     uint vertListOfFace = GetVertexListOfFace(f);
 
-    for (int j = 0; j < 4; j++)
+    for (uint j = 0; j < 4; j++)
     {
         uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
         // Non-zero if ALL the vertices are behind any of the planes.
@@ -310,7 +310,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
     }
 }
 
-void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset,
+void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint baseVertexOffset,
                                out uint srcBegin, out uint srcSize,
                                out float4 vertRingBuffer[MAX_CLIP_VERTS])
 {
@@ -320,7 +320,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
     uint clipMaskOfFace = 0; // Initially in front
     uint vertListOfFace = GetVertexListOfFace(f);
 
-    for (int j = 0; j < 4; j++)
+    for (uint j = 0; j < 4; j++)
     {
         uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
         // Non-zero if ANY of the vertices are behind any of the planes.
@@ -328,10 +328,10 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
 
         // Not all edges may require clipping. However, filtering the vertex list
         // is somewhat expensive, so we currently don't do it.
-        vertRingBuffer[j].x = gs_HapVertsX[firstVertexOffset + v];
-        vertRingBuffer[j].y = gs_HapVertsY[firstVertexOffset + v];
-        vertRingBuffer[j].z = gs_HapVertsZ[firstVertexOffset + v];
-        vertRingBuffer[j].w = gs_HapVertsW[firstVertexOffset + v];
+        vertRingBuffer[j].x = gs_HapVertsX[baseVertexOffset + v];
+        vertRingBuffer[j].y = gs_HapVertsY[baseVertexOffset + v];
+        vertRingBuffer[j].z = gs_HapVertsZ[baseVertexOffset + v];
+        vertRingBuffer[j].w = gs_HapVertsW[baseVertexOffset + v];
     }
 
     // Sutherland-Hodgeman polygon clipping algorithm.
@@ -418,10 +418,10 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
 
     const uint intraGroupLightIndex = t / THREADS_PER_LIGHT;
     const uint globalLightIndex     = g * LIGHTS_PER_GROUP + intraGroupLightIndex;
-    const uint firstVertexOffset    = intraGroupLightIndex * NUM_VERTS;
+    const uint baseVertexOffset     = intraGroupLightIndex * NUM_VERTS;
 
-    const int eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
-    const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset];
+    const uint eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
+    const SFiniteLightBound  cullData = g_data[eyeAdjustedInputOffset];
 
     const float4x4 projMat    = g_mProjectionArr[eyeIndex];
     const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex];
@@ -521,11 +521,11 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
             cullClipFaceMask |= GetFaceMaskOfVertex(v);
         }
 
-        gs_HapVertsX[firstVertexOffset + v]          = hapVert.x;
-        gs_HapVertsY[firstVertexOffset + v]          = hapVert.y;
-        gs_HapVertsZ[firstVertexOffset + v]          = hapVert.z;
-        gs_HapVertsW[firstVertexOffset + v]          = hapVert.w;
-        gs_BehindMasksOfVerts[firstVertexOffset + v] = behindMask;
+        gs_HapVertsX[baseVertexOffset + v]          = hapVert.x;
+        gs_HapVertsY[baseVertexOffset + v]          = hapVert.y;
+        gs_HapVertsZ[baseVertexOffset + v]          = hapVert.z;
+        gs_HapVertsW[baseVertexOffset + v]          = hapVert.w;
+        gs_BehindMasksOfVerts[baseVertexOffset + v] = behindMask;
     }
 
 #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
@@ -613,7 +613,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
 
     for (uint i = 0; i < NUM_VERTS; i++)
     {
-        behindMasksOfVerts[i] = gs_BehindMasksOfVerts[firstVertexOffset + i];
+        behindMasksOfVerts[i] = gs_BehindMasksOfVerts[baseVertexOffset + i];
     }
 
     // (3) Cull the faces.
@@ -669,7 +669,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
 
                 uint   srcBegin, srcSize;
                 float4 vertRingBuffer[MAX_CLIP_VERTS];
-                ClipFaceAgainstViewVolume(f, behindMasksOfVerts, firstVertexOffset,
+                ClipFaceAgainstViewVolume(f, behindMasksOfVerts, baseVertexOffset,
                                           srcBegin, srcSize, vertRingBuffer);
                 UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, invProjMat,
                            ndcAaBbMinPt, ndcAaBbMaxPt);
@@ -717,7 +717,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
     ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]);
 #endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
 
-    if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
+    if ((globalLightIndex < g_iNrVisibLights) && (t % THREADS_PER_LIGHT == 0)) // Avoid bank conflicts
     {
         // For stereo, we have two sets of lights. Therefore, each eye has a set of mins
         // followed by a set of maxs, and each set is equal to g_iNrVisibLights.

From 3e28378351118888600c4a4b5b3495c39d71302e Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 17:39:14 -0700
Subject: [PATCH 18/22] Add a profiling marker

---
 .../Runtime/Lighting/LightLoop/LightLoop.cs   | 27 ++++++++++---------
 .../Lighting/LightLoop/scrbound.compute       |  4 +--
 .../Runtime/RenderPipeline/HDProfileId.cs     |  1 +
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
index a476e3b789f..8af2d142711 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs
@@ -2769,24 +2769,27 @@ static void GenerateLightsScreenSpaceAABBs(in BuildGPULightListParameters parame
         {
             if (parameters.totalLightCount != 0)
             {
-                var tileAndCluster = resources.tileAndClusterData;
+                using (new ProfilingScope(cmd, ProfilingSampler.Get(HDProfileId.GenerateLightAABBs)))
+                {
+                    var tileAndCluster = resources.tileAndClusterData;
 
-                cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_isOrthographic, parameters.isOrthographic ? 1 : 0);
+                    cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_isOrthographic, parameters.isOrthographic ? 1 : 0);
 
-                // With XR single-pass, we have one set of light bounds per view to iterate over (bounds are in view space for each view)
-                cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_iNrVisibLights, parameters.totalLightCount);
-                cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_data, tileAndCluster.convexBoundsBuffer);
-                cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_vBoundsBuffer, tileAndCluster.AABBBoundsBuffer);
+                    // With XR single-pass, we have one set of light bounds per view to iterate over (bounds are in view space for each view)
+                    cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_iNrVisibLights, parameters.totalLightCount);
+                    cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_data, tileAndCluster.convexBoundsBuffer);
+                    cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_vBoundsBuffer, tileAndCluster.AABBBoundsBuffer);
 
-                cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mProjectionArr, parameters.lightListProjHMatrices);
-                cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mInvProjectionArr, parameters.lightListInvProjHMatrices);
+                    cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mProjectionArr, parameters.lightListProjHMatrices);
+                    cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mInvProjectionArr, parameters.lightListInvProjHMatrices);
 
-                const int threadsPerLight = 4;  // Shader: THREADS_PER_LIGHT (4)
-                const int threadsPerGroup = 64; // Shader: THREADS_PER_GROUP (64)
+                    const int threadsPerLight = 4;  // Shader: THREADS_PER_LIGHT (4)
+                    const int threadsPerGroup = 64; // Shader: THREADS_PER_GROUP (64)
 
-                int groupCount = HDUtils.DivRoundUp(parameters.totalLightCount * threadsPerLight, threadsPerGroup);
+                    int groupCount = HDUtils.DivRoundUp(parameters.totalLightCount * threadsPerLight, threadsPerGroup);
 
-                cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, groupCount, parameters.viewCount, 1);
+                    cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, groupCount, parameters.viewCount, 1);
+                }
             }
         }
 
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index c804adad4dc..233876c830c 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -9,7 +9,7 @@
 // #pragma enable_d3d11_debug_symbols
 #pragma only_renderers d3d11 playstation xboxone vulkan metal switch
 
-#pragma kernel GenLightAABB
+#pragma kernel main
 
 /* ------------------------------ Inputs ------------------------------------ */
 
@@ -410,7 +410,7 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT
 // *********************************************************************************************
 
 [numthreads(THREADS_PER_GROUP, 1, 1)]
-void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
+void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
 {
     const uint t        = threadID;
     const uint g        = groupID.x;
diff --git a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs
index 8959bcbc944..1d5e71b62f7 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs
+++ b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs
@@ -15,6 +15,7 @@ internal enum HDProfileId
         DenoiseSSAO,
         UpSampleSSAO,
         ScreenSpaceShadows,
+        GenerateLightAABBs,
         BuildLightList,
         ContactShadows,
         BlitToFinalRTDevBuildOnly,

From 06a8d7081cce9889c573d8816d28f1d457436e51 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 17:58:14 -0700
Subject: [PATCH 19/22] Fix lane masks

---
 .../Lighting/LightLoop/scrbound.compute       | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 233876c830c..585441e30c0 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -535,7 +535,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         uint orMask  = 0;                       // Plays no role
         uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
         // TODO: Francesco - expose the right intrinsic.
-        cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask);
+        cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
     }
 #else
     InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
@@ -644,7 +644,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         uint orMask  = 0;                       // Plays no role
         uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
         // TODO: Francesco - expose the right intrinsic.
-        cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask);
+        cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
     }
 #else
     InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
@@ -684,14 +684,14 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         uint orMask  = 0;                       // Plays no role
         uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
         // TODO: Francesco - expose the right intrinsic.
-        ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, orMask, 0, xorMask));
-        ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, orMask, 0, xorMask));
-        ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, orMask, 0, xorMask));
-        ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, orMask, 0, xorMask));
-        ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, orMask, 0, xorMask));
-        ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, orMask, 0, xorMask));
-        ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, orMask, 0, xorMask));
-        ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, orMask, 0, xorMask));
+        ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask));
+        ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask));
+        ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask));
+        ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, andMask, orMask, xorMask));
+        ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, andMask, orMask, xorMask));
+        ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, andMask, orMask, xorMask));
+        ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, andMask, orMask, xorMask));
+        ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, andMask, orMask, xorMask));
     }
 #else
     // Integer comparison works for floating-point numbers as long as the sign bit is 0.

From a7fcd99e66edfdf1539186ccfd20fcc845731a35 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 18:44:10 -0700
Subject: [PATCH 20/22] Fix compiler warning

---
 .../Lighting/LightLoop/scrbound.compute       | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 585441e30c0..7cdeab66b02 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -101,7 +101,7 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 
 /* ------------------------------ Implementation ---------------------------- */
 
-#define DUMB_COMPILER // Improve the quality of generated code
+#define DUMB_COMPILER // Improve the quality of generated code at the expense of readability
 
 #define CLEAR_SIGN_BIT(X)  (asint(X) & INT_MAX)
 #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks
@@ -259,10 +259,10 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
     uint modDstIdx = dstBegin % MAX_CLIP_VERTS;
 #endif
 
-    for (uint k = srcBegin; k < (srcBegin + srcSize); k++)
+    for (uint j = srcBegin; j < (srcBegin + srcSize); j++)
     {
     #ifndef DUMB_COMPILER
-        uint modSrcIdx = k % MAX_CLIP_VERTS;
+        uint modSrcIdx = j % MAX_CLIP_VERTS;
     #endif
         ClipVertex leadVert = CreateClipVertex(p, vertRingBuffer[modSrcIdx]);
 
@@ -457,8 +457,10 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
     // any single plane, we can trivially reject (cull) that face.
     uint cullClipFaceMask = 0; // Initially inside
 
+    uint i; // Avoid multiply-declared variable warning
+
     // (1) Compute the vertices of the light volume.
-    for (uint i = 0; i < VERTS_PER_THREAD; i++)
+    for (i = 0; i < VERTS_PER_THREAD; i++)
     {
         uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
 
@@ -529,7 +531,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
     }
 
 #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
-    for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
+    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
     {
         uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
         uint orMask  = 0;                       // Plays no role
@@ -575,7 +577,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
             lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix);
         }
 
-        for (uint i = 0; i < VERTS_PER_THREAD; i++)
+        for (i = 0; i < VERTS_PER_THREAD; i++)
         {
             uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
 
@@ -611,7 +613,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
 
     uint behindMasksOfVerts[NUM_VERTS];
 
-    for (uint i = 0; i < NUM_VERTS; i++)
+    for (i = 0; i < NUM_VERTS; i++)
     {
         behindMasksOfVerts[i] = gs_BehindMasksOfVerts[baseVertexOffset + i];
     }
@@ -621,7 +623,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         const uint cullFaceMask   = cullClipFaceMask;
         const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
 
-        for (uint i = 0; i < FACES_PER_THREAD; i++)
+        for (i = 0; i < FACES_PER_THREAD; i++)
         {
             uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
 
@@ -638,7 +640,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
     }
 
 #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
-    for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
+    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
     {
         uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
         uint orMask  = 0;                       // Plays no role
@@ -659,7 +661,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         const uint clipFaceMask   = cullClipFaceMask;
         const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
 
-        for (uint i = 0; i < FACES_PER_THREAD; i++)
+        for (i = 0; i < FACES_PER_THREAD; i++)
         {
             uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
 
@@ -678,7 +680,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
     }
 
 #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
-    for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
+    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
     {
         uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
         uint orMask  = 0;                       // Plays no role

From 71004f0b983dbb1c16d1842b8011147ab469b2a9 Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Tue, 11 Aug 2020 18:56:38 -0700
Subject: [PATCH 21/22] Remove GPU Pro reference

---
 .../Runtime/Lighting/LightLoop/scrbound.compute       | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 7cdeab66b02..26783afb163 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -1,16 +1,13 @@
-// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
-// https://github.com/wolfgangfengel/GPU-Pro-7
+// #pragma enable_d3d11_debug_symbols
+#pragma only_renderers d3d11 playstation xboxone vulkan metal switch
+
+#pragma kernel main
 
 #include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
 
-// #pragma enable_d3d11_debug_symbols
-#pragma only_renderers d3d11 playstation xboxone vulkan metal switch
-
-#pragma kernel main
-
 /* ------------------------------ Inputs ------------------------------------ */
 
 uniform uint g_isOrthographic;

From 21c2481e2ee71bbd88878a37fd24e197f61837bd Mon Sep 17 00:00:00 2001
From: Evgenii <evgenii@unity3d.com>
Date: Wed, 26 Aug 2020 12:00:28 -0700
Subject: [PATCH 22/22] No intrinsics on Xbox

---
 .../Lighting/LightLoop/scrbound.compute       | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 26783afb163..95670ade423 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -98,7 +98,19 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
 
 /* ------------------------------ Implementation ---------------------------- */
 
-#define DUMB_COMPILER // Improve the quality of generated code at the expense of readability
+// Improve the quality of generated code at the expense of readability.
+// Remove when the shader compiler is clever enough to perform this optimization for us.
+#define DUMB_COMPILER
+
+#ifdef SHADER_API_XBOXONE
+// The Xbox shader compiler expects the lane swizzle mask to be a compile-time constant.
+// In our case, the mask is a compile-time constant, but it is defined inside a loop
+// that is unrolled at compile time, and the constants are generated during the
+// constant propagation pass of the optimizer. This works fine on PlayStation, but does not work
+// on Xbox. In order to avoid writing hideous code specifically for Xbox, we disable support
+// for wave intrinsics on Xbox until the Xbox compiler is fixed.
+#undef PLATFORM_SUPPORTS_WAVE_INTRINSICS
+#endif
 
 #define CLEAR_SIGN_BIT(X)  (asint(X) & INT_MAX)
 #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks
@@ -533,7 +545,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
         uint orMask  = 0;                       // Plays no role
         uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
-        // TODO: Francesco - expose the right intrinsic.
+
         cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
     }
 #else
@@ -642,7 +654,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
         uint orMask  = 0;                       // Plays no role
         uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
-        // TODO: Francesco - expose the right intrinsic.
+
         cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
     }
 #else
@@ -682,7 +694,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
         uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
         uint orMask  = 0;                       // Plays no role
         uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB
-        // TODO: Francesco - expose the right intrinsic.
+
         ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask));
         ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask));
         ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask));