From c084cdd0df7fa4c35be0136e7ae57bb0ebf79302 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Mon, 20 Jul 2020 16:35:07 -0700 Subject: [PATCH 01/22] Implement clipping and culling (does not consider view frustum corners) --- .../ShaderLibrary/Macros.hlsl | 1 + .../Runtime/Lighting/LightLoop/LightLoop.cs | 9 +- .../Lighting/LightLoop/scrbound.compute | 529 +++++++++++++++++- 3 files changed, 533 insertions(+), 6 deletions(-) diff --git a/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl b/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl index f8f478a163e..c89f6f0bbd2 100644 --- a/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl +++ b/com.unity.render-pipelines.core/ShaderLibrary/Macros.hlsl @@ -43,6 +43,7 @@ #define HALF_MIN 6.103515625e-5 // 2^-14, the same value for 10, 11 and 16-bit: https://www.khronos.org/opengl/wiki/Small_Float_Formats #define HALF_MAX 65504.0 #define UINT_MAX 0xFFFFFFFFu +#define INT_MAX 0x7FFFFFFF #ifdef SHADER_API_GLES diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs index 1ca6843ed51..1aa7b2292f8 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs @@ -1642,9 +1642,12 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig } else if (gpuLightType == GPULightType.Point) { - Vector3 vx = xAxisVS; - Vector3 vy = yAxisVS; - Vector3 vz = zAxisVS; + // Construct a view-space axis-aligned bounding cube around the bounding sphere. + // This allows us to utilize the same polygon clipping technique for all lights. + // Non-axis-aligned vectors may result in a larger screen-space AABB. 
+                Vector3 vx = new Vector3(1, 0, 0);
+                Vector3 vy = new Vector3(0, 1, 0);
+                Vector3 vz = new Vector3(0, 0, 1);
 
                 bound.center = positionVS;
                 bound.boxAxisX = vx * range;
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 2681070522f..6b37c450c6e 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -10,6 +10,7 @@
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
 
+// #pragma enable_d3d11_debug_symbols
 #pragma only_renderers d3d11 playstation xboxone vulkan metal switch
 
 uniform int g_isOrthographic;
@@ -25,6 +26,288 @@ StructuredBuffer g_data : register( t0 );
 // output buffer
 RWStructuredBuffer g_vBoundsBuffer : register( u0 );
 
+#define DUMB_COMPILER // Selects the code paths that wrap ring-buffer indices manually instead of using '%' per iteration
+// #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported
+
+#ifdef Z_BINNING
+
+// Computes r=(n/d) and rounds the result up to the next integer, i.e. ceil(n/d) for (d > 0).
+uint DivRoundUp(uint n, uint d)
+{
+    return (n + d - 1) / d; // No division by 0 checks
+}
+
+// Returns the location of the N-th set bit (N is zero-based) starting from the lowest order bit and working upward.
+// Slow implementation - do not use for large bit sets.
+// Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html
+uint NthBitLow(uint value, uint n)
+{
+    uint b = -1;                                    // Consistent with the behavior of firstbitlow()
+    uint c = countbits(value);
+
+    if (n < c)                                      // Validate inputs
+    {
+        uint r = n + 1;                             // Compute the number of remaining bits
+
+        do
+        {
+            uint f = firstbitlow(value >> (b + 1)); // Find the next set bit
+            b += f + r;                             // Make a guess (assume all [b+f+1,b+f+r] bits are set)
+            c = countbits(value << (32 - (b + 1))); // Count the number of bits actually set
+            r = (n + 1) - c;                        // Compute the number of remaining bits
+        } while (r > 0);
+    }
+
+    return b; // Still -1 (0xFFFFFFFF) if 'value' has no more than 'n' bits set
+}
+
+// Clipping a plane by a cube may produce a hexagon (6-gon).
+// Clipping a hexagon by 4 planes may produce a decagon (10-gon).
+#define MAX_CLIP_VERTS    (10)
+#define NUM_EDGES         (12)
+#define NUM_VERTS         (8)
+#define NUM_FACES         (6)
+#define NUM_PLANES        (6)
+#define THREADS_PER_LIGHT (4)
+#define THREADS_PER_GROUP (64)
+#define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
+#define VERTS_PER_GROUP   (NUM_VERTS * LIGHTS_PER_GROUP)
+#define VERTS_PER_THREAD  (NUM_VERTS / THREADS_PER_LIGHT)
+#define FACES_PER_THREAD  DivRoundUp(NUM_FACES, THREADS_PER_LIGHT)
+
+// All planes and faces are always in the standard order (see below).
+#define FACE_LEFT   (1 << 0) // x = -1
+#define FACE_RIGHT  (1 << 1) // x = +1
+#define FACE_FRONT  (1 << 2) // y = -1
+#define FACE_BACK   (1 << 3) // y = +1
+#define FACE_TOP    (1 << 4) // z = -1
+#define FACE_BOTTOM (1 << 5) // z = +1
+#define FACE_MASK   ((1 << NUM_FACES) - 1) // All 6 face bits set
+
+// TODO: the compiler generates 'tbuffer_load_format_x' instructions
+// when we access the look-up tables. Can we avoid this?
+
+// All vertices are always in the standard order (see below).
+static const uint s_FaceMasksOfVerts[NUM_VERTS] = // Per vertex: the FACE_* bits of the 3 faces that share it
+{
+    FACE_LEFT  | FACE_FRONT | FACE_TOP,    // 0: (-1, -1, -1)
+    FACE_RIGHT | FACE_FRONT | FACE_TOP,    // 1: (+1, -1, -1)
+    FACE_RIGHT | FACE_BACK  | FACE_TOP,    // 2: (+1, +1, -1)
+    FACE_LEFT  | FACE_BACK  | FACE_TOP,    // 3: (-1, +1, -1)
+    FACE_LEFT  | FACE_FRONT | FACE_BOTTOM, // 4: (-1, -1, +1)
+    FACE_RIGHT | FACE_FRONT | FACE_BOTTOM, // 5: (+1, -1, +1)
+    FACE_RIGHT | FACE_BACK  | FACE_BOTTOM, // 6: (+1, +1, +1)
+    FACE_LEFT  | FACE_BACK  | FACE_BOTTOM  // 7: (-1, +1, +1)
+};
+
+// CCW order (starting with the LSB) of vertices for each face (w.r.t. its normal),
+// with normals pointing in the interior of the volume. Each entry packs 4 vertex indices, 3 bits per index.
+static const uint s_VertMasksOfFaces[NUM_FACES] =
+{
+    3 << 9 | 7 << 6 | 4 << 3 | 0 << 0, // 0: FACE_LEFT
+    5 << 9 | 6 << 6 | 2 << 3 | 1 << 0, // 1: FACE_RIGHT
+    4 << 9 | 5 << 6 | 1 << 3 | 0 << 0, // 2: FACE_FRONT
+    2 << 9 | 6 << 6 | 7 << 3 | 3 << 0, // 3: FACE_BACK
+    1 << 9 | 2 << 6 | 3 << 3 | 0 << 0, // 4: FACE_TOP
+    7 << 9 | 6 << 6 | 5 << 3 | 4 << 0  // 5: FACE_BOTTOM
+};
+
+// 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
+groupshared float gs_HapVertsX[VERTS_PER_GROUP];
+groupshared float gs_HapVertsY[VERTS_PER_GROUP];
+groupshared float gs_HapVertsZ[VERTS_PER_GROUP];
+groupshared float gs_HapVertsW[VERTS_PER_GROUP];
+groupshared uint  gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types)
+
+#ifndef USE_WAVE_INTRINSICS
+// 1 array * 16 elements * 4 bytes each = 64 bytes.
+groupshared uint gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces each (HLSL does not support small data types)
+
+// 6 arrays * 16 elements * 4 bytes each = 384 bytes.
+// Note that these are actually floats reinterpreted as uints.
+// The reason is that floating-point atomic operations are not supported.
+groupshared uint gs_RapAaBbMinPtX[LIGHTS_PER_GROUP];
+groupshared uint gs_RapAaBbMaxPtX[LIGHTS_PER_GROUP];
+groupshared uint gs_RapAaBbMinPtY[LIGHTS_PER_GROUP];
+groupshared uint gs_RapAaBbMaxPtY[LIGHTS_PER_GROUP];
+groupshared uint gs_RapAaBbMinPtZ[LIGHTS_PER_GROUP];
+groupshared uint gs_RapAaBbMaxPtZ[LIGHTS_PER_GROUP];
+#endif // USE_WAVE_INTRINSICS
+
+// Returns 'true' if it manages to cull the face 'f', i.e. if all 4 of its vertices are behind the same plane.
+bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
+{
+    uint cullMaskOfFace = FACE_MASK; // Initially behind
+    uint vertMaskOfFace = s_VertMasksOfFaces[f];
+
+    for (int j = 0; j < 4; j++)
+    {
+        uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3);
+        // Remains non-zero only if ALL the vertices are behind the same plane (any one of them).
+        cullMaskOfFace &= behindMasksOfVerts[v];
+    }
+
+    return (cullMaskOfFace != 0);
+}
+
+struct ClipVertex
+{
+    float4 pt; // Homogeneous coordinate after perspective
+    float  bc; // Boundary coordinate with respect to the plane 'p'; (bc >= 0) means inside
+};
+
+ClipVertex CreateClipVertex(uint p, float4 v) // Pairs the vertex 'v' with its boundary coordinate for the plane 'p'
+{
+    bool evenPlane = (p % 2) == 0; // Even planes cross '0', odd planes cross 'w'
+
+    float c = v[p / 2]; // Planes (2k) and (2k+1) bound the k-th component
+    float w = v.w;
+
+    ClipVertex cv;
+
+    cv.pt = v;
+    cv.bc = evenPlane ? c : w - c; // dot(PlaneEquation, HapVertex);
+
+    return cv;
+}
+
+float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1) // Returns the point on the edge (v0, v1) where 'bc' interpolates to 0
+{
+    float alpha = saturate(v0.bc * rcp(v0.bc - v1.bc)); // Guaranteed to lie between 0 and 1
+
+    return lerp(v0.pt, v1.pt, alpha);
+}
+
+void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
+                             inout float4 vertRingBuffer[MAX_CLIP_VERTS],
+                             out uint dstBegin, out uint dstSize) // Reads [srcBegin, srcBegin+srcSize), writes [dstBegin, dstBegin+dstSize) (indices taken modulo MAX_CLIP_VERTS)
+{
+    dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
+    dstSize  = 0;
+
+    ClipVertex tailVert = CreateClipVertex(p, vertRingBuffer[(srcBegin + srcSize - 1) % MAX_CLIP_VERTS]);
+
+#ifdef DUMB_COMPILER
+    uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
+    uint modDstIdx = dstBegin % MAX_CLIP_VERTS;
+#endif
+
+    for (uint k = srcBegin; k < (srcBegin + srcSize); k++)
+    {
+    #ifndef DUMB_COMPILER
+        uint modSrcIdx = k % MAX_CLIP_VERTS;
+    #endif
+        ClipVertex leadVert = CreateClipVertex(p, vertRingBuffer[modSrcIdx]);
+
+        // Execute Blinn's line clipping algorithm.
+        // Classify the line segment. 4 cases:
+        // 0. v0 out, v1 out -> add nothing
+        // 1. v0 in,  v1 out -> add intersection
+        // 2. v0 out, v1 in  -> add intersection, add v1
+        // 3. v0 in,  v1 in  -> add v1
+        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of the signed zero.
+
+        if ((tailVert.bc >= 0) != (leadVert.bc >= 0))
+        {
+            // The line segment is guaranteed to cross the plane.
+            float4 clipVert = IntersectEdgeAgainstPlane(tailVert, leadVert);
+        #ifndef DUMB_COMPILER
+            uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
+        #endif
+            vertRingBuffer[modDstIdx] = clipVert;
+        #ifdef DUMB_COMPILER
+            dstSize++;
+            modDstIdx++;
+            modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
+        #endif
+        }
+
+        if (leadVert.bc >= 0)
+        {
+        #ifndef DUMB_COMPILER
+            uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
+        #endif
+            vertRingBuffer[modDstIdx] = leadVert.pt;
+        #ifdef DUMB_COMPILER
+            dstSize++;
+            modDstIdx++;
+            modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
+        #endif
+        }
+
+    #ifdef DUMB_COMPILER
+        modSrcIdx++;
+        modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx;
+    #endif
+        tailVert = leadVert; // Avoid recomputation and overwriting the vertex in the ring buffer
+    }
+}
+
+void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset,
+                                            inout float3 rapAaBbMinPt, inout float3 rapAaBbMaxPt) // Clips the face 'f' and grows the AABB by the clipped vertices (after division by w)
+{
+    float4 vertRingBuffer[MAX_CLIP_VERTS];
+    uint   srcBegin = 0, srcSize = 4;
+
+    uint clipMaskOfFace = 0; // Initially in front
+    uint vertMaskOfFace = s_VertMasksOfFaces[f];
+
+    for (int j = 0; j < 4; j++)
+    {
+        uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3);
+        // Non-zero if ANY of the vertices are behind any of the planes.
+        clipMaskOfFace |= behindMasksOfVerts[v];
+
+        // Note that not all edges may require clipping. However,
+        // filtering the vertex list is somewhat expensive, so we currently don't do it.
+        vertRingBuffer[j].x = gs_HapVertsX[firstVertexOffset + v];
+        vertRingBuffer[j].y = gs_HapVertsY[firstVertexOffset + v];
+        vertRingBuffer[j].z = gs_HapVertsZ[firstVertexOffset + v];
+        vertRingBuffer[j].w = gs_HapVertsW[firstVertexOffset + v];
+    }
+
+    const uint numPlanesToClipAgainst = countbits(clipMaskOfFace); // [1, 6]
+
+    // Sutherland-Hodgman polygon clipping algorithm.
+    // It works by clipping the entire polygon against one clipping plane at a time.
+    for (uint j = 0; j < numPlanesToClipAgainst; j++)
+    {
+        uint p = firstbitlow(clipMaskOfFace);
+
+        uint dstBegin, dstSize;
+        ClipPolygonAgainstPlane(p, srcBegin, srcSize, vertRingBuffer, dstBegin, dstSize);
+
+        srcBegin = dstBegin;
+        srcSize  = dstSize;
+
+        clipMaskOfFace ^= 1 << p; // Clear the bit to continue using firstbitlow()
+    }
+
+#ifdef DUMB_COMPILER
+    uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
+#endif
+
+    for (int j = srcBegin; j < (srcBegin + srcSize); j++) // NOTE(review): signed 'j' is compared against an unsigned bound; 'uint' looks intended
+    {
+    #ifndef DUMB_COMPILER
+        uint modSrcIdx = j % MAX_CLIP_VERTS;
+    #endif
+
+        float4 hapVert = vertRingBuffer[modSrcIdx];
+        float3 rapVert = hapVert.xyz * rcp(hapVert.w);
+
+        rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
+        rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
+
+    #ifdef DUMB_COMPILER
+        modSrcIdx++;
+        modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx;
+    #endif
+    }
+}
+
+#else // !Z_BINNING
+
 #define MAX_PNTS 9 // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed)
 // However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane
 // clipping gets skipped which doesn't cause any errors.
@@ -44,6 +327,7 @@ void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, ou
 
 #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl"
 
+#endif // Z_BINNING
 
 [numthreads(NR_THREADS, 1, 1)]
 void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
@@ -59,13 +343,248 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
     unsigned int g = groupID;
     unsigned int t = threadID;
 
-    const int subLigt = (int) (t/8);
-    const int lgtIndex = subLigt+(int) g*8;
-    const int sideIndex = (int) (t%8);
+    const int subLigt = (uint) (t/8);
+    const int lgtIndex = subLigt+(uint) g*8;
+    const int sideIndex = (uint) (t%8);
 
     const int eyeAdjustedLgtIndex = GenerateLightCullDataIndex(lgtIndex, g_iNrVisibLights, eyeIndex);
     SFiniteLightBound lgtDat = g_data[eyeAdjustedLgtIndex];
 
+#ifdef Z_BINNING
+    //**********************************************************************************************
+    // The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
+    // The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
+    //
+    // Since a light volume may be partially off-screen, we must clip it before computing the AABB.
+    // Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
+    //
+    // To avoid having to deal with toroidal properties of the perspective transform,
+    // we perform clipping using the homogeneous (projective) post-perspective coordinates.
+    // This clipping method is described in Blinn's paper titled "Line Clipping".
+    //
+    // The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
+    // worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
+    // Note that some faces may require culling rather than clipping (the former is simpler).
+    //
+    // It's important to realize that face culling may end up culling 5 (or even all 6) faces.
+    // This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
+    // (Imagine a view volume completely or partially inside a light volume).
+    // Therefore, we must perform view-volume-corner-inside-light-volume tests (not implemented here yet).
+    //
+    //
+    // Notation:
+    // rbp - real (3D) coordinates before perspective
+    // hbp - hom. (4D) coordinates before perspective
+    // hap - hom. (4D) coordinates after perspective
+    // rap - real (3D) coordinates after perspective (after division by w)
+    // *********************************************************************************************
+
+    const uint groupLocalLightIndex = t / THREADS_PER_LIGHT; // NOTE(review): 'lgtIndex' above is derived from (t / 8), not (t / THREADS_PER_LIGHT) - confirm the intended thread-to-light mapping
+    const uint firstVertexOffset    = NUM_VERTS * groupLocalLightIndex;
+
+    const float2 scale = lgtDat.scaleXY.xy;
+    const float3 rbpC  = lgtDat.center.xyz;
+    const float3 rbpX  = lgtDat.boxAxisX.xyz;
+    const float3 rbpY  = lgtDat.boxAxisY.xyz;
+    const float3 rbpZ  = lgtDat.boxAxisZ.xyz;
+
+#ifndef USE_WAVE_INTRINSICS
+    // Initialize the TGSM. All threads write the same value -> no data races.
+    // The hardware will coalesce the writes. NOTE(review): no barrier separates these writes from the first atomic below.
+    gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside
+    gs_RapAaBbMinPtX[groupLocalLightIndex] = asuint(1.0f);
+    gs_RapAaBbMaxPtX[groupLocalLightIndex] = asuint(0.0f);
+    gs_RapAaBbMinPtY[groupLocalLightIndex] = asuint(1.0f);
+    gs_RapAaBbMaxPtY[groupLocalLightIndex] = asuint(0.0f);
+    gs_RapAaBbMinPtZ[groupLocalLightIndex] = asuint(1.0f);
+    gs_RapAaBbMaxPtZ[groupLocalLightIndex] = asuint(0.0f);
+#endif // USE_WAVE_INTRINSICS
+
+    float3 rapAaBbMinPt = 1;
+    float3 rapAaBbMaxPt = 0;
+
+    // We must determine whether we have to clip or cull any of the faces.
+    // If all vertices of a face are inside with respect to all the culling planes,
+    // we can trivially accept that face. If all vertices of a face are behind
+    // any single plane, we can trivially reject (cull) that face.
+    uint cullClipFaceMask = 0; // Initially inside
+
+    // (1) Compute the vertices of the light volume.
+    for (uint i = 0; i < VERTS_PER_THREAD; i++)
+    {
+        uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+        // rbpVerts[0] = rbpC - rbpX * scale.x - rbpY * scale.y - rbpZ; // (-1, -1, -1)
+        // rbpVerts[1] = rbpC + rbpX * scale.x - rbpY * scale.y - rbpZ; // (+1, -1, -1)
+        // rbpVerts[2] = rbpC + rbpX * scale.x + rbpY * scale.y - rbpZ; // (+1, +1, -1)
+        // rbpVerts[3] = rbpC - rbpX * scale.x + rbpY * scale.y - rbpZ; // (-1, +1, -1)
+        // rbpVerts[4] = rbpC - rbpX - rbpY + rbpZ; // (-1, -1, +1)
+        // rbpVerts[5] = rbpC + rbpX - rbpY + rbpZ; // (+1, -1, +1)
+        // rbpVerts[6] = rbpC + rbpX + rbpY + rbpZ; // (+1, +1, +1)
+        // rbpVerts[7] = rbpC - rbpX + rbpY + rbpZ; // (-1, +1, +1)
+
+        float3 m; // See the comment above
+
+        m.x = (countbits(v % 4) == 1) ? 1 : -1;
+        m.y = ((v & 2) != 0) ? 1 : -1; // Parenthesized: '!=' binds tighter than '&', so (v & 2 != 0) would test bit 0
+        m.z = (v >= 4) ? 1 : -1;
+
+        m.xy *= (v >= 4) ? 1 : scale;
+
+        float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
+        float4 hapVert = mul(g_mProjection, float4(rbpVert, 1));
+
+        // Make sure the W component is strictly positive.
+        // It is helpful in order to simplify clipping and to avoid perspective division by 0.
+        float w = hapVert.w;
+        float s = (w >= 0) ? 1 : -1;
+
+        // Transform the X and Y components: [-w, w] -> [0, w].
+        hapVert.x = (0.5 * s) * hapVert.x + ((0.5 * s) * w);
+        hapVert.y = (0.5 * s) * hapVert.y + ((0.5 * s) * w);
+        hapVert.z = s * hapVert.z;
+        hapVert.w = max(abs(w), FLT_MIN);
+
+        // For each vertex, we must determine whether it is within the bounds.
+        // For culling and clipping, we must know, per culling plane, whether the vertex
+        // is in the positive or the negative half-space.
+        uint behindMask = 0; // Initially in front
+
+        // Consider the vertex to be inside the view volume if:
+        // 0 <= x <= w
+        // 0 <= y <= w
+        // 0 <= z <= w
+        w = hapVert.w;
+
+        for (uint j = 0; j < (NUM_PLANES / 2); j++)
+        {
+            behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
+            behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
+        }
+
+        if (behindMask == 0) // Inside?
+        {
+            float3 rapVert = hapVert.xyz * rcp(hapVert.w);
+
+            rapAaBbMinPt = min(rapAaBbMinPt, rapVert);
+            rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert);
+        }
+        else // Outside
+        {
+            cullClipFaceMask |= s_FaceMasksOfVerts[v];
+        }
+
+        gs_HapVertsX[firstVertexOffset + v] = hapVert.x;
+        gs_HapVertsY[firstVertexOffset + v] = hapVert.y;
+        gs_HapVertsZ[firstVertexOffset + v] = hapVert.z;
+        gs_HapVertsW[firstVertexOffset + v] = hapVert.w;
+        gs_BehindMasksOfVerts[firstVertexOffset + v] = behindMask;
+    }
+
+#ifdef USE_WAVE_INTRINSICS
+    // ...
+#else
+    InterlockedOr(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
+
+    GroupMemoryBarrierWithGroupSync();
+
+    cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
+#endif
+
+    if (cullClipFaceMask != 0)
+    {
+        // The light may be partially outside the view volume.
+    }
+
+    uint behindMasksOfVerts[NUM_VERTS];
+
+    for (uint i = 0; i < NUM_VERTS; i++)
+    {
+        behindMasksOfVerts[i] = gs_BehindMasksOfVerts[firstVertexOffset + i];
+    }
+
+    // (2) Cull the faces.
+    const uint cullFaceMask   = cullClipFaceMask;
+    const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
+
+    for (uint i = 0; i < FACES_PER_THREAD; i++)
+    {
+        uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+        if (n < numFacesToCull)
+        {
+            uint f = NthBitLow(cullFaceMask, n);
+
+            if (TryCullFace(f, behindMasksOfVerts))
+            {
+                cullClipFaceMask ^= 1 << f; // Clear the bit
+            }
+        }
+    }
+
+#ifdef USE_WAVE_INTRINSICS
+    // ...
+#else
+    InterlockedAnd(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask);
+
+    GroupMemoryBarrierWithGroupSync();
+
+    cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex];
+#endif
+
+    // (3) Clip the faces.
+    const uint clipFaceMask   = cullClipFaceMask;
+    const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
+
+    for (uint i = 0; i < FACES_PER_THREAD; i++)
+    {
+        uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
+
+        if (n < numFacesToClip) // Must match 'clipFaceMask': NthBitLow() returns -1 when (n >= countbits(mask))
+        {
+            uint f = NthBitLow(clipFaceMask, n);
+
+            ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset,
+                                                   rapAaBbMinPt, rapAaBbMaxPt);
+        }
+    }
+
+#ifdef USE_WAVE_INTRINSICS
+    // ...
+#else
+    // Integer comparison works for floating-point numbers as long as the sign bit is 0.
+    // We must take care of the signed zero ourselves.
+    InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(rapAaBbMinPt.x) & INT_MAX);
+    InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(rapAaBbMaxPt.x) & INT_MAX);
+    InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(rapAaBbMinPt.y) & INT_MAX);
+    InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(rapAaBbMaxPt.y) & INT_MAX);
+    InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(rapAaBbMinPt.z) & INT_MAX);
+    InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(rapAaBbMaxPt.z) & INT_MAX);
+
+    GroupMemoryBarrierWithGroupSync();
+
+    rapAaBbMinPt.x = asfloat(gs_RapAaBbMinPtX[groupLocalLightIndex]);
+    rapAaBbMaxPt.x = asfloat(gs_RapAaBbMaxPtX[groupLocalLightIndex]);
+    rapAaBbMinPt.y = asfloat(gs_RapAaBbMinPtY[groupLocalLightIndex]);
+    rapAaBbMaxPt.y = asfloat(gs_RapAaBbMaxPtY[groupLocalLightIndex]);
+    rapAaBbMinPt.z = asfloat(gs_RapAaBbMinPtZ[groupLocalLightIndex]);
+    rapAaBbMaxPt.z = asfloat(gs_RapAaBbMaxPtZ[groupLocalLightIndex]);
+#endif // USE_WAVE_INTRINSICS
+
+    if (t % THREADS_PER_LIGHT == 0)
+    {
+        // Each light's AABB is represented by two float3s, the min and max of the box.
+        // And for stereo, we have two sets of lights. Therefore, each eye has a set of mins, followed by
+        // a set of maxs, and each set is equal to g_iNrVisibLights.
+        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex);
+
+        float minLinearDepth = -1, maxLinearDepth = -1; // TODO
+
+        g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, minLinearDepth);
+        g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, maxLinearDepth);
+    }
+
+#else // !Z_BINNING
     const float3 boxX = lgtDat.boxAxisX.xyz;
     const float3 boxY = lgtDat.boxAxisY.xyz;
     const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a spot-light)
@@ -390,8 +909,10 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
             g_vBoundsBuffer[boundsIndices.max] = float4(0.5*vMax.x + 0.5, 0.5*vMax.y + 0.5, vMax.z*VIEWPORT_SCALE_Z, linMaZ);
         }
     }
+#endif // Z_BINNING
 }
 
+#ifndef Z_BINNING
 
 float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p);
 
@@ -536,3 +1057,5 @@ void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, ou
     vMin = B;
     vMax = A;
 }
+
+#endif // !Z_BINNING
\ No newline at end of file
From a6f00ce6dc90e9e308ff90080b62985856333a15 Mon Sep 17 00:00:00 2001
From: Evgenii
Date: Wed, 5 Aug 2020 14:04:30 -0700
Subject: [PATCH 02/22] Support orthographic projection
---
 .../Runtime/Lighting/LightLoop/scrbound.compute | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
index 6b37c450c6e..714a8002fa2 100644
--- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
+++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute
@@ -436,7 +436,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 
         // Make sure the W component is strictly positive.
// It is helpful in order to simplify clipping and to avoid perspective division by 0. - float w = hapVert.w; + // For the orthographic projection, we only consider (w = 1) + float w = g_isOrthographic ? 1 : hapVert.w; float s = (w >= 0) ? 1 : -1; // Transform the X and Y components: [-w, w] -> [0, w]. From f8eea291dbff3ed9e2a8ffc8ff8701b41f2b9651 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Wed, 5 Aug 2020 14:38:37 -0700 Subject: [PATCH 03/22] k_identity -> k_Identity --- .../ShaderLibrary/Common.hlsl | 10 +++++----- .../ShaderLibrary/ImageBasedLighting.hlsl | 4 ++-- .../Editor/ShaderGraph/SharedCode.template.hlsl | 2 +- .../Runtime/Material/AxF/AxF.hlsl | 10 +++++----- .../AxF/PreIntegratedFGD_CookTorrance.shader | 2 +- .../Runtime/Material/Eye/Eye.hlsl | 4 ++-- .../GGXConvolution/ComputeGgxIblSampleData.compute | 4 ++-- .../Runtime/Material/Lit/Lit.hlsl | 10 +++++----- .../Runtime/Material/Lit/SimpleLit.hlsl | 2 +- .../Runtime/Material/StackLit/StackLit.hlsl | 12 ++++++------ .../RenderPipeline/ShaderPass/VaryingMesh.hlsl | 2 +- 11 files changed, 31 insertions(+), 31 deletions(-) diff --git a/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl b/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl index 60449601be5..ce325adcde6 100644 --- a/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl +++ b/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl @@ -852,11 +852,11 @@ void CompositeOver(real3 colorFront, real3 alphaFront, // Space transformations // ---------------------------------------------------------------------------- -static const float3x3 k_identity3x3 = {1, 0, 0, +static const float3x3 k_Identity3x3 = {1, 0, 0, 0, 1, 0, 0, 0, 1}; -static const float4x4 k_identity4x4 = {1, 0, 0, 0, +static const float4x4 k_Identity4x4 = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1}; @@ -880,7 +880,7 @@ float4 ComputeClipSpacePosition(float2 positionNDC, float deviceDepth) // (position = positionCS) => (clipSpaceTransform = use default) // 
(position = positionVS) => (clipSpaceTransform = UNITY_MATRIX_P) // (position = positionWS) => (clipSpaceTransform = UNITY_MATRIX_VP) -float4 ComputeClipSpacePosition(float3 position, float4x4 clipSpaceTransform = k_identity4x4) +float4 ComputeClipSpacePosition(float3 position, float4x4 clipSpaceTransform = k_Identity4x4) { return mul(clipSpaceTransform, float4(position, 1.0)); } @@ -890,7 +890,7 @@ float4 ComputeClipSpacePosition(float3 position, float4x4 clipSpaceTransform = k // (position = positionCS) => (clipSpaceTransform = use default) // (position = positionVS) => (clipSpaceTransform = UNITY_MATRIX_P) // (position = positionWS) => (clipSpaceTransform = UNITY_MATRIX_VP) -float3 ComputeNormalizedDeviceCoordinatesWithZ(float3 position, float4x4 clipSpaceTransform = k_identity4x4) +float3 ComputeNormalizedDeviceCoordinatesWithZ(float3 position, float4x4 clipSpaceTransform = k_Identity4x4) { float4 positionCS = ComputeClipSpacePosition(position, clipSpaceTransform); @@ -912,7 +912,7 @@ float3 ComputeNormalizedDeviceCoordinatesWithZ(float3 position, float4x4 clipSpa // (position = positionCS) => (clipSpaceTransform = use default) // (position = positionVS) => (clipSpaceTransform = UNITY_MATRIX_P) // (position = positionWS) => (clipSpaceTransform = UNITY_MATRIX_VP) -float2 ComputeNormalizedDeviceCoordinates(float3 position, float4x4 clipSpaceTransform = k_identity4x4) +float2 ComputeNormalizedDeviceCoordinates(float3 position, float4x4 clipSpaceTransform = k_Identity4x4) { return ComputeNormalizedDeviceCoordinatesWithZ(position, clipSpaceTransform).xy; } diff --git a/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl b/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl index 406fa64810a..b67991a54ee 100644 --- a/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl +++ b/com.unity.render-pipelines.core/ShaderLibrary/ImageBasedLighting.hlsl @@ -183,7 +183,7 @@ void SampleVisibleAnisoGGXDir(real2 u, real3x3 
viewToLocal; if (VeqN) { - viewToLocal = k_identity3x3; + viewToLocal = k_Identity3x3; } else { @@ -366,7 +366,7 @@ real4 IntegrateGGXAndDisneyDiffuseFGD(real NdotV, real roughness, uint sampleCou real3 V = real3(sqrt(1 - NdotV * NdotV), 0, NdotV); real4 acc = real4(0.0, 0.0, 0.0, 0.0); - real3x3 localToWorld = k_identity3x3; + real3x3 localToWorld = k_Identity3x3; for (uint i = 0; i < sampleCount; ++i) { diff --git a/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl b/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl index 93ebaac408a..54af96c8743 100644 --- a/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl +++ b/com.unity.render-pipelines.high-definition/Editor/ShaderGraph/SharedCode.template.hlsl @@ -7,7 +7,7 @@ // Init to some default value to make the computer quiet (else it output 'divide by zero' warning even if value is not used). // TODO: this is a really poor workaround, but the variable is used in a bunch of places // to compute normals which are then passed on elsewhere to compute other values... 
- output.tangentToWorld = k_identity3x3; + output.tangentToWorld = k_Identity3x3; output.positionSS = input.positionCS; // input.positionCS is SV_Position $FragInputs.positionRWS: output.positionRWS = input.positionRWS; diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl index 29a7ad28cef..94758aa5406 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/AxF.hlsl @@ -1338,14 +1338,14 @@ PreLightData GetPreLightData(float3 viewWS_Clearcoat, PositionInputs posInput } else { - preLightData.ltcTransformDiffuse = k_identity3x3; // Lambert + preLightData.ltcTransformDiffuse = k_Identity3x3; // Lambert } // Load specular LTC & FGD switch ((_SVBRDF_BRDFType >> 1) & 7) { // Warning: all these LTC_MATRIX_INDEX_ are the same for now, and fitted for GGX, hence the code - // above that selected the UVs all used a preLightData.iblPerceptualRoughness value that used a + // above that selected the UVs all used a preLightData.iblPerceptualRoughness value that used a // conversion formula for Beckmann NDF (exp) based BRDFs // (see switch ((_SVBRDF_BRDFType >> 1) & 7) above and usage of PerceptualRoughnessBeckmannToGGX) // @@ -2037,7 +2037,7 @@ DirectLighting EvaluateBSDF_Line( LightLoopContext lightLoopContext, //----------------------------------------------------------------------------- // Use Lambert for diffuse - ltcValue = LTCEvaluate(P1, P2, B, k_identity3x3); // No transform: Lambert uses identity + ltcValue = LTCEvaluate(P1, P2, B, k_Identity3x3); // No transform: Lambert uses identity ltcValue *= lightData.diffuseDimmer; lighting.diffuse = ltcValue; // no FGD, lambert gives 1 @@ -2141,7 +2141,7 @@ DirectLighting EvaluateBSDF_Line( LightLoopContext lightLoopContext, { // Only lighting, not BSDF // Apply area light on lambert then multiply by PI to cancel Lambert - 
lighting.diffuse = LTCEvaluate(P1, P2, B, k_identity3x3); + lighting.diffuse = LTCEvaluate(P1, P2, B, k_Identity3x3); lighting.diffuse *= PI * lightData.diffuseDimmer; } #endif @@ -2358,7 +2358,7 @@ DirectLighting EvaluateBSDF_Rect(LightLoopContext lightLoopContext, { // Only lighting, not BSDF // Apply area light on lambert then multiply by PI to cancel Lambert - lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_identity3x3)); + lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_Identity3x3)); lighting.diffuse *= PI * lightData.diffuseDimmer; } #endif diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader index 5dddd6a2842..8f2bcc9fd1b 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/AxF/PreIntegratedFGD_CookTorrance.shader @@ -89,7 +89,7 @@ Shader "Hidden/HDRP/PreIntegratedFGD_CookTorrance" float NdotV = ClampNdotV( dot(N, V) ); float4 acc = float4(0.0, 0.0, 0.0, 0.0); - float3x3 localToWorld = GetLocalFrame(N); //TODO: N not needed, we use a frame aligned to N, should use k_identity3x3 + float3x3 localToWorld = GetLocalFrame(N); //TODO: N not needed, we use a frame aligned to N, should use k_Identity3x3 for (uint i = 0; i < sampleCount; ++i) { diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl index df704ded1ff..3466ba4c714 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Eye/Eye.hlsl @@ -295,7 +295,7 @@ PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData b float2 uv = Remap01ToHalfTexelCoord(float2(bsdfData.perceptualRoughness, 
theta * INV_HALF_PI), LTC_LUT_SIZE); // Note we load the matrix transpose (avoid to have to transpose it in shader) - preLightData.ltcTransformDiffuse = k_identity3x3; + preLightData.ltcTransformDiffuse = k_Identity3x3; // Get the inverse LTC matrix for GGX // Note we load the matrix transpose (avoid to have to transpose it in shader) @@ -660,7 +660,7 @@ DirectLighting EvaluateBSDF_Rect( LightLoopContext lightLoopContext, { // Only lighting, not BSDF // Apply area light on lambert then multiply by PI to cancel Lambert - lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_identity3x3)); + lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_Identity3x3)); lighting.diffuse *= PI * lightData.diffuseDimmer; } #endif diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute b/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute index d8445386ed7..71c3ebfd5f9 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/GGXConvolution/ComputeGgxIblSampleData.compute @@ -57,7 +57,7 @@ void ComputeGgxIblSampleData(uint3 groupThreadId : SV_GroupThreadID) // TODO: might be interesting to try Mitchell's Poisson disk sampling algorithm. // In our case, samples would not have disks associated with them, but rather solid angles. 
float2 u = Golden2dSeq(i, sampleCount); - SampleGGXDir(u, V, k_identity3x3, roughness, localL, NdotL, NdotH, LdotH, true); + SampleGGXDir(u, V, k_Identity3x3, roughness, localL, NdotL, NdotH, LdotH, true); if (NdotL > 0) { @@ -77,7 +77,7 @@ void ComputeGgxIblSampleData(uint3 groupThreadId : SV_GroupThreadID) float2 u = Golden2dSeq(sampleIndex, sampleCount); - SampleGGXDir(u, V, k_identity3x3, roughness, localL, NdotL, NdotH, LdotH, true); + SampleGGXDir(u, V, k_Identity3x3, roughness, localL, NdotL, NdotH, LdotH, true); float pdf = 0.25 * D_GGX(NdotH, roughness); float omegaS = rcp(sampleCount) * rcp(pdf); diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl index 959efa32a01..996785eb3c7 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl @@ -1082,7 +1082,7 @@ PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData b // Note we load the matrix transpose (avoid to have to transpose it in shader) #ifdef USE_DIFFUSE_LAMBERT_BRDF - preLightData.ltcTransformDiffuse = k_identity3x3; + preLightData.ltcTransformDiffuse = k_Identity3x3; #else // Get the inverse LTC matrix for Disney Diffuse preLightData.ltcTransformDiffuse = 0.0; @@ -1417,7 +1417,7 @@ DirectLighting EvaluateBSDF_Line( LightLoopContext lightLoopContext, // Use the Lambertian approximation for performance reasons. // The matrix multiplication should not generate any extra ALU on GCN. // TODO: double evaluation is very inefficient! This is a temporary solution. - ltcValue = LTCEvaluate(P1, P2, B, mul(flipMatrix, k_identity3x3)); + ltcValue = LTCEvaluate(P1, P2, B, mul(flipMatrix, k_Identity3x3)); ltcValue *= lightData.diffuseDimmer; // We use diffuse lighting for accumulation since it is going to be blurred during the SSS pass. 
// We don't multiply by 'bsdfData.diffuseColor' here. It's done only once in PostEvaluateBSDF(). @@ -1452,7 +1452,7 @@ DirectLighting EvaluateBSDF_Line( LightLoopContext lightLoopContext, { // Only lighting, not BSDF // Apply area light on lambert then multiply by PI to cancel Lambert - lighting.diffuse = LTCEvaluate(P1, P2, B, k_identity3x3); + lighting.diffuse = LTCEvaluate(P1, P2, B, k_Identity3x3); lighting.diffuse *= PI * lightData.diffuseDimmer; } #endif @@ -1572,7 +1572,7 @@ DirectLighting EvaluateBSDF_Rect( LightLoopContext lightLoopContext, // Use the Lambertian approximation for performance reasons. // The matrix multiplication should not generate any extra ALU on GCN. - float3x3 ltcTransform = mul(flipMatrix, k_identity3x3); + float3x3 ltcTransform = mul(flipMatrix, k_Identity3x3); // Polygon irradiance in the transformed configuration. // TODO: double evaluation is very inefficient! This is a temporary solution. @@ -1640,7 +1640,7 @@ DirectLighting EvaluateBSDF_Rect( LightLoopContext lightLoopContext, { // Only lighting, not BSDF // Apply area light on lambert then multiply by PI to cancel Lambert - lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_identity3x3)); + lighting.diffuse = PolygonIrradiance(mul(lightVerts, k_Identity3x3)); lighting.diffuse *= PI * lightData.diffuseDimmer; } #endif diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl index 6e311ca6951..75efb9f667e 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/SimpleLit.hlsl @@ -211,7 +211,7 @@ PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData b float theta = FastACosPos(clampedNdotV); // For Area light - UVs for sampling the LUTs float2 uv = LTC_LUT_OFFSET + LTC_LUT_SCALE * float2(bsdfData.perceptualRoughness, theta * 
INV_HALF_PI); - preLightData.ltcTransformDiffuse = k_identity3x3; + preLightData.ltcTransformDiffuse = k_Identity3x3; preLightData.ltcTransformSpecular = 0.0; preLightData.ltcTransformSpecular._m22 = 1.0; diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl index 80a53b76a8a..9f8a5c28ff7 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/StackLit/StackLit.hlsl @@ -2261,7 +2261,7 @@ void PreLightData_SetupAreaLights(BSDFData bsdfData, float3 V, float3 N[NB_NORMA #ifdef USE_DIFFUSE_LAMBERT_BRDF - preLightData.ltcTransformDiffuse = k_identity3x3; + preLightData.ltcTransformDiffuse = k_Identity3x3; #else // TODO // Get the inverse LTC matrix for Disney Diffuse @@ -2314,7 +2314,7 @@ void PreLightData_SetupAreaLightsAniso(BSDFData bsdfData, float3 V, float3 N[NB_ #ifdef USE_DIFFUSE_LAMBERT_BRDF - preLightData.ltcTransformDiffuse = k_identity3x3; + preLightData.ltcTransformDiffuse = k_Identity3x3; #else // TODO // Get the inverse LTC matrix for Disney Diffuse @@ -3801,7 +3801,7 @@ DirectLighting EvaluateBSDF_Line( LightLoopContext lightLoopContext, // Use the Lambertian approximation for performance reasons. // The matrix multiplication should not generate any extra ALU on GCN. // TODO: double evaluation is very inefficient! This is a temporary solution. 
- ltcValue = LTCEvaluate(localP1, localP2, B, mul(flipMatrix, k_identity3x3)); + ltcValue = LTCEvaluate(localP1, localP2, B, mul(flipMatrix, k_Identity3x3)); ltcValue *= lightData.diffuseDimmer; // VLAYERED_DIFFUSE_ENERGY_HACKED_TERM: @@ -3892,7 +3892,7 @@ DirectLighting EvaluateBSDF_Line( LightLoopContext lightLoopContext, // Only lighting, not BSDF // Apply area light on lambert then multiply by PI to cancel Lambert - lighting.diffuse = LTCEvaluate(localP1, localP2, B, k_identity3x3); + lighting.diffuse = LTCEvaluate(localP1, localP2, B, k_Identity3x3); lighting.diffuse *= PI * lightData.diffuseDimmer; } #endif @@ -4022,7 +4022,7 @@ DirectLighting EvaluateBSDF_Rect( LightLoopContext lightLoopContext, // Use the Lambertian approximation for performance reasons. // The matrix multiplication should not generate any extra ALU on GCN. - float3x3 ltcTransform = mul(flipMatrix, k_identity3x3); + float3x3 ltcTransform = mul(flipMatrix, k_Identity3x3); // Polygon irradiance in the transformed configuration. // TODO: double evaluation is very inefficient! This is a temporary solution. 
@@ -4136,7 +4136,7 @@ DirectLighting EvaluateBSDF_Rect( LightLoopContext lightLoopContext, // Only lighting, not BSDF // Apply area light on lambert then multiply by PI to cancel Lambert - lighting.diffuse = PolygonIrradiance(mul(localLightVerts, k_identity3x3)); + lighting.diffuse = PolygonIrradiance(mul(localLightVerts, k_Identity3x3)); lighting.diffuse *= PI * lightData.diffuseDimmer; } #endif diff --git a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl index dca58fd987e..d2d12385dca 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/ShaderPass/VaryingMesh.hlsl @@ -140,7 +140,7 @@ FragInputs UnpackVaryingsMeshToFragInputs(PackedVaryingsMeshToPS input) // Init to some default value to make the computer quiet (else it output "divide by zero" warning even if value is not used). // TODO: this is a really poor workaround, but the variable is used in a bunch of places // to compute normals which are then passed on elsewhere to compute other values... 
- output.tangentToWorld = k_identity3x3; + output.tangentToWorld = k_Identity3x3; output.positionSS = input.positionCS; // input.positionCS is SV_Position From ccbb9e16e6b27b85038f026d99a6267a96e6ab73 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Wed, 5 Aug 2020 15:52:27 -0700 Subject: [PATCH 04/22] Turn 'scaleXY' into a scalar --- .../Runtime/Lighting/LightLoop/LightLoop.cs | 30 +++++++++---------- .../Lighting/LightLoop/LightLoop.cs.hlsl | 2 +- .../Lighting/LightLoop/scrbound.compute | 8 ++--- .../Runtime/Material/Decal/DecalSystem.cs | 6 ++-- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs index 1aa7b2292f8..3d6fc7dc90c 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs @@ -128,8 +128,8 @@ struct SFiniteLightBound public Vector3 boxAxisY; // Scaled by the extents (half-size) public Vector3 boxAxisZ; // Scaled by the extents (half-size) public Vector3 center; // Center of the bounds (box) in camera space - public Vector2 scaleXY; // Scale applied to the top of the box to turn it into a truncated pyramid - public float radius; // Circumscribed sphere for the bounds (box) + public float scaleXY; // Scale applied to the top of the box to turn it into a truncated pyramid (X = Y) + public float radius; // Circumscribed sphere for the bounds (box) }; [GenerateHLSL] @@ -1628,9 +1628,9 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig fAltDx *= range; fAltDy *= range; // Handle case of pyramid with this select (currently unused) - var altDist = Mathf.Sqrt(fAltDy * fAltDy + (true ? 1.0f : 2.0f) * fAltDx * fAltDx); - bound.radius = altDist > (0.5f * range) ? 
altDist : (0.5f * range); // will always pick fAltDist - bound.scaleXY = squeeze ? new Vector2(0.01f, 0.01f) : new Vector2(1.0f, 1.0f); + var altDist = Mathf.Sqrt(fAltDy * fAltDy + (true ? 1.0f : 2.0f) * fAltDx * fAltDx); + bound.radius = altDist > (0.5f * range) ? altDist : (0.5f * range); // will always pick fAltDist + bound.scaleXY = squeeze ? 0.01f : 1.0f; lightVolumeData.lightAxisX = vx; lightVolumeData.lightAxisY = vy; @@ -1653,8 +1653,8 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig bound.boxAxisX = vx * range; bound.boxAxisY = vy * range; bound.boxAxisZ = vz * range; - bound.scaleXY.Set(1.0f, 1.0f); - bound.radius = range; + bound.scaleXY = 1.0f; + bound.radius = range; // fill up ldata lightVolumeData.lightAxisX = vx; @@ -1675,7 +1675,7 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig bound.boxAxisY = extents.y * yAxisVS; bound.boxAxisZ = extents.z * zAxisVS; bound.radius = extents.magnitude; - bound.scaleXY.Set(1.0f, 1.0f); + bound.scaleXY = 1.0f; lightVolumeData.lightPos = centerVS; lightVolumeData.lightAxisX = xAxisVS; @@ -1695,7 +1695,7 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig bound.boxAxisY = extents.y * yAxisVS; bound.boxAxisZ = extents.z * zAxisVS; bound.radius = extents.magnitude; - bound.scaleXY.Set(1.0f, 1.0f); + bound.scaleXY = 1.0f; lightVolumeData.lightPos = centerVS; lightVolumeData.lightAxisX = xAxisVS; @@ -1715,7 +1715,7 @@ void GetLightVolumeDataAndBound(LightCategory lightCategory, GPULightType gpuLig bound.boxAxisY = extents.y * yAxisVS; bound.boxAxisZ = extents.z * zAxisVS; bound.radius = extents.magnitude; - bound.scaleXY.Set(1.0f, 1.0f); + bound.scaleXY = 1.0f; lightVolumeData.lightPos = centerVS; lightVolumeData.lightAxisX = xAxisVS; @@ -1894,8 +1894,8 @@ void GetEnvLightVolumeDataAndBound(HDProbe probe, LightVolumeType lightVolumeTyp bound.boxAxisX = influenceRightVS * influenceExtents.x; bound.boxAxisY = influenceUpVS 
* influenceExtents.x; bound.boxAxisZ = influenceForwardVS * influenceExtents.x; - bound.scaleXY.Set(1.0f, 1.0f); - bound.radius = influenceExtents.x; + bound.scaleXY = 1.0f; + bound.radius = influenceExtents.x; break; } case LightVolumeType.Box: @@ -1904,8 +1904,8 @@ void GetEnvLightVolumeDataAndBound(HDProbe probe, LightVolumeType lightVolumeTyp bound.boxAxisX = influenceExtents.x * influenceRightVS; bound.boxAxisY = influenceExtents.y * influenceUpVS; bound.boxAxisZ = influenceExtents.z * influenceForwardVS; - bound.scaleXY.Set(1.0f, 1.0f); - bound.radius = influenceExtents.magnitude; + bound.scaleXY = 1.0f; + bound.radius = influenceExtents.magnitude; // The culling system culls pixels that are further // than a threshold to the box influence extents. @@ -1945,7 +1945,7 @@ void AddBoxVolumeDataAndBound(OrientedBBox obb, LightCategory category, LightFea bound.boxAxisY = obb.extentY * upVS; bound.boxAxisZ = obb.extentZ * forwardVS; bound.radius = extents.magnitude; - bound.scaleXY.Set(1.0f, 1.0f); + bound.scaleXY = 1.0f; // The culling system culls pixels that are further // than a threshold to the box influence extents. 
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl index 5efdcddabfc..f158b3be894 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl @@ -66,7 +66,7 @@ struct SFiniteLightBound float3 boxAxisY; float3 boxAxisZ; float3 center; - float2 scaleXY; + float scaleXY; float radius; }; diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 714a8002fa2..38582d5e548 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -382,11 +382,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) const uint groupLocalLightIndex = t / THREADS_PER_LIGHT; const uint firstVertexOffset = NUM_VERTS * groupLocalLightIndex; - const float2 scale = lgtDat.scaleXY.xy; + const float scale = lgtDat.scaleXY.x; // scale.x = scale.y const float3 rbpC = lgtDat.center.xyz; - const float3 rbpX = lgtDat.boxAxisX.xyz; - const float3 rbpY = lgtDat.boxAxisY.xyz; - const float3 rbpZ = lgtDat.boxAxisZ.xyz; + const float3 rbpX = lgtDat.boxAxisX.xyz; // Pre-scaled + const float3 rbpY = lgtDat.boxAxisY.xyz; // Pre-scaled + const float3 rbpZ = lgtDat.boxAxisZ.xyz; // Pre-scaled #ifndef USE_WAVE_INTRINSICS // Initialize the TGSM. All threads write the same value -> no data races. 
diff --git a/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs b/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs index 55c80ef43a1..c71e4846911 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs +++ b/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalSystem.cs @@ -579,12 +579,12 @@ private void GetDecalVolumeDataAndBound(Matrix4x4 decalToWorld, Matrix4x4 worldT var influenceForwardVS = worldToView.MultiplyVector(influenceZ / influenceExtents.z); var influencePositionVS = worldToView.MultiplyPoint(pos); // place the mesh pivot in the center - m_Bounds[m_DecalDatasCount].center = influencePositionVS; + m_Bounds[m_DecalDatasCount].center = influencePositionVS; m_Bounds[m_DecalDatasCount].boxAxisX = influenceRightVS * influenceExtents.x; m_Bounds[m_DecalDatasCount].boxAxisY = influenceUpVS * influenceExtents.y; m_Bounds[m_DecalDatasCount].boxAxisZ = influenceForwardVS * influenceExtents.z; - m_Bounds[m_DecalDatasCount].scaleXY.Set(1.0f, 1.0f); - m_Bounds[m_DecalDatasCount].radius = influenceExtents.magnitude; + m_Bounds[m_DecalDatasCount].scaleXY = 1.0f; + m_Bounds[m_DecalDatasCount].radius = influenceExtents.magnitude; // The culling system culls pixels that are further // than a threshold to the box influence extents. 
From 1559d1c80966b02b9ce2992fbb7bdc72c8af6d84 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Wed, 5 Aug 2020 18:53:33 -0700 Subject: [PATCH 05/22] Test corners of the view volume --- .../Lighting/LightLoop/scrbound.compute | 138 +++++++++++++++++- 1 file changed, 130 insertions(+), 8 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 38582d5e548..77eb4f6a530 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -61,6 +61,55 @@ uint NthBitLow(uint value, uint n) return b; } +float4x4 Translation4x4(float3 d) +{ + float4x4 M = k_Identity4x4; + + M._14_24_34 = d; // Last column + + return M; +} + +float3x3 Rotation3x3(float3 xAxis, float3 yAxis, float3 zAxis) +{ + float3x3 R = float3x3(xAxis, yAxis, zAxis); + float3x3 C = transpose(R); // Row to column + + return C; +} + +float3x3 Invert3x3(float3x3 R) +{ + float3x3 C = transpose(R); // Row to column + float det = dot(C[0], cross(C[1], C[2])); + float3x3 adj = float3x3(cross(C[1], C[2]), + cross(C[2], C[0]), + cross(C[0], C[1])); + + return rcp(det) * adj; +} + +float4x4 Homogenize3x3(float3x3 R) +{ + float4x4 M = float4x4(float4(R[0], 0), + float4(R[1], 0), + float4(R[2], 0), + float4(0,0,0,1)); + + return M; +} + +float4x4 PerspectiveProjection4x4(float s, float g, float n, float f) +{ + float a = (f + n) * rcp(f - n); + float b = -2 * f * n * rcp(f - n); + + return float4x4(g/s, 0, 0, 0, + 0, g, 0, 0, + 0, 0, a, b, + 0, 0, 1, 0); +} + // Clipping a plane by a cube may produce a hexagon (6-gon). // Clipping a hexagon by 4 planes may produce a decagon (10-gon). 
#define MAX_CLIP_VERTS (10) @@ -382,7 +431,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) const uint groupLocalLightIndex = t / THREADS_PER_LIGHT; const uint firstVertexOffset = NUM_VERTS * groupLocalLightIndex; - const float scale = lgtDat.scaleXY.x; // scale.x = scale.y + const float scale = lgtDat.scaleXY; // scale.x = scale.y const float3 rbpC = lgtDat.center.xyz; const float3 rbpX = lgtDat.boxAxisX.xyz; // Pre-scaled const float3 rbpY = lgtDat.boxAxisY.xyz; // Pre-scaled @@ -432,19 +481,21 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) m.xy *= (v >= 4) ? 1 : scale; float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ; + // Avoid generating (w = 0). + rbpVert.z = (abs(rbpVert.z) >= FLT_EPS) ? rbpVert.z : FLT_EPS; + float4 hapVert = mul(g_mProjection, float4(rbpVert, 1)); // Make sure the W component is strictly positive. // It is helpful in order to simplify clipping and to avoid perspective division by 0. - // For the orthographic projection, we only consider (w = 1) - float w = g_isOrthographic ? 1 : hapVert.w; + float w = hapVert.w; float s = (w >= 0) ? 1 : -1; // Transform the X and Y components: [-w, w] -> [0, w]. hapVert.x = (0.5 * s) * hapVert.x + ((0.5 * s) * w); hapVert.y = (0.5 * s) * hapVert.y + ((0.5 * s) * w); hapVert.z = s * hapVert.z; - hapVert.w = max(abs(w), FLT_MIN); + hapVert.w = s * hapVert.w; // For each vertex, we must determine whether it is within the bounds. // For culling and clipping, we must know, per culling plane, whether the vertex @@ -453,8 +504,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // Consider the vertex to be inside the view volume if: // 0 <= x <= w - // 0 <= y <= w + // 0 <= y <= w <-- include boundary points, to avoid clipping them later // 0 <= z <= w + // w is always valid + // For the orthographic projection, (w = 1), so no modifications are necessary. + // TODO: epsilon for numerical robustness? 
w = hapVert.w; for (uint j = 0; j < (NUM_PLANES / 2); j++) @@ -492,9 +546,77 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex]; #endif + // (2) Test the corners of the view volume. if (cullClipFaceMask != 0) { - // The light may be partially outside the view volume. + // The light is partially outside the view volume. + // Therefore, some of the corners of the view volume may be inside the light volume. + // We perform aggressive culling, so we must make sure they are accounted for. + // The light volume is a special type of cuboid - a right frustum. + // We can exploit this fact by building a light-space projection matrix. + float4x4 invTranslateToLightSpace = Translation4x4(-rbpC); + float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(Rotation3x3(rbpX, rbpY, rbpZ))); + // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly. + + // This (orhographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube. + float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace); + + if (scale != 1) // Perspective light space? + { + // Compute the parameters of the perspective projection. 
+ float s = scale; + float e = -1 - 2 * (s * rcp(1 - s)); // Signed distance from the origin to the eye + float n = -e - 1; // Distance from the eye to the near plane + float f = -e + 1; // Distance from the eye to the far plane + float g = f; // Distance from the eye to the projection plane + + float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e)); + float4x4 perspProjMatrix = PerspectiveProjection4x4(s, g, n, f); + + lightSpaceMatrix = mul(perspProjMatrix, mul(invTranslateEye, lightSpaceMatrix)); + } + + for (uint i = 0; i < VERTS_PER_THREAD; i++) + { + uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; + + // rapVertsCS[0] = (-1, -1, 0) + // rapVertsCS[1] = (+1, -1, 0) + // rapVertsCS[2] = (+1, +1, 0) + // rapVertsCS[3] = (-1, +1, 0) + // rapVertsCS[4] = (-1, -1, 1) + // rapVertsCS[5] = (+1, -1, 1) + // rapVertsCS[6] = (+1, +1, 1) + // rapVertsCS[7] = (-1, +1, 1) + + float3 rapVertCS; // See the comment above + + rapVertCS.x = (countbits(v % 4) == 1) ? 1 : -1; + rapVertCS.y = (v & 2 != 0) ? 1 : -1; + rapVertCS.z = (v >= 4) ? 1 : 0; + + float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space + float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS); // View to light space + + // Consider the vertex to be inside the light volume if: + // -w < x < w + // -w < y < w <-- exclude boundary points, as we will not clip using these vertices + // -w < z < w + // 0 < w + // For the orthographic projection, (w = 1), so no modifications are necessary. + // TODO: epsilon for numerical robustness? + + bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w; + + if (inside) + { + float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z); + + // Update the AABB. 
+ rapAaBbMinPt = min(rapAaBbMinPt, rapVertNDC); + rapAaBbMaxPt = max(rapAaBbMaxPt, rapVertNDC); + } + } } uint behindMasksOfVerts[NUM_VERTS]; @@ -504,7 +626,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) behindMasksOfVerts[i] = gs_BehindMasksOfVerts[firstVertexOffset + i]; } - // (2) Cull the faces. + // (3) Cull the faces. const uint cullFaceMask = cullClipFaceMask; const uint numFacesToCull = countbits(cullFaceMask); // [0, 6] @@ -533,7 +655,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex]; #endif - // (3) Clip the faces. + // (4) Clip the faces. const uint clipFaceMask = cullClipFaceMask; const uint numFacesToClip = countbits(clipFaceMask); // [0, 6] From b8e8c836bc1d6bf9536d806b79598d8b1b283f02 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Wed, 5 Aug 2020 18:58:56 -0700 Subject: [PATCH 06/22] Improve the placeholder for the linear depth --- .../Runtime/Lighting/LightLoop/scrbound.compute | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 77eb4f6a530..0bbd71b58ed 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -701,7 +701,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // a set of maxs, and each set is equal to g_iNrVisibLights. 
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex); - float minLinearDepth = -1, maxLinearDepth = -1; // TODO + float minLinearDepth = 0, maxLinearDepth = FLT_MAX; // TODO g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, minLinearDepth); g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, maxLinearDepth); From 676eb5a564641650c60a805c0df6a0208b7a9ffd Mon Sep 17 00:00:00 2001 From: Evgenii Date: Wed, 5 Aug 2020 20:08:03 -0700 Subject: [PATCH 07/22] Fix aspect --- .../Runtime/Lighting/LightLoop/scrbound.compute | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 0bbd71b58ed..92e39049c78 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -99,14 +99,14 @@ float4x4 Homogenize3x3(float3x3 R) return M; } -float4x4 PerspectiveProjection4x4(float s, float g, float n, float f) +float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) { - float a = (f + n) * rcp(f - n); - float b = -2 * f * n * rcp(f - n); + float b = (f + n) * rcp(f - n); + float c = -2 * f * n * rcp(f - n); - return float4x4(g/s, 0, 0, 0, + return float4x4(g/a, 0, 0, 0, 0, g, 0, 0, - 0, 0, a, b, + 0, 0, b, c, 0, 0, 1, 0); } @@ -571,7 +571,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float g = f; // Distance from the eye to the projection plane float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e)); - float4x4 perspProjMatrix = PerspectiveProjection4x4(s, g, n, f); + float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f); lightSpaceMatrix = mul(perspProjMatrix, mul(invTranslateEye, lightSpaceMatrix)); } From 
433e27e7b6f6f5f87e8fd7376f796a6695544fcc Mon Sep 17 00:00:00 2001 From: Evgenii Date: Wed, 5 Aug 2020 20:41:09 -0700 Subject: [PATCH 08/22] Bugfix --- .../Lighting/LightLoop/scrbound.compute | 155 +++++++++--------- 1 file changed, 81 insertions(+), 74 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 92e39049c78..665296d158b 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -10,7 +10,7 @@ #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl" -// #pragma enable_d3d11_debug_symbols +#pragma enable_d3d11_debug_symbols #pragma only_renderers d3d11 playstation xboxone vulkan metal switch uniform int g_isOrthographic; @@ -26,17 +26,12 @@ StructuredBuffer g_data : register( t0 ); // output buffer RWStructuredBuffer g_vBoundsBuffer : register( u0 ); -#define DUMB_COMPILER +#define Z_BINNING +// #define DUMB_COMPILER // #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported #ifdef Z_BINNING -// Computes r=(n/d) and rounds the result towards the largest adjacent integer. -uint DivRoundUp(uint n, uint d) -{ - return (n + d - 1) / d; // No division by 0 checks -} - // Returns the location of the N-th set bit starting from the lowest order bit and working upward. // Slow implementation - do not use for large bit sets. 
// Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html @@ -110,21 +105,24 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) 0, 0, 1, 0); } +#define CLEAR_SIGN_BIT(X) (asuint(X) & INT_MAX) +#define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks + // Clipping a plane by a cube may produce a hexagon (6-gon). // Clipping a hexagon by 4 planes may produce a decagon (10-gon). #define MAX_CLIP_VERTS (10) -#define NUM_EDGES (12) #define NUM_VERTS (8) #define NUM_FACES (6) #define NUM_PLANES (6) -#define THREADS_PER_LIGHT (4) #define THREADS_PER_GROUP (64) +#define THREADS_PER_LIGHT (1) // Set to 1 for debugging #define LIGHTS_PER_GROUP (THREADS_PER_GROUP / THREADS_PER_LIGHT) #define VERTS_PER_GROUP (NUM_VERTS * LIGHTS_PER_GROUP) #define VERTS_PER_THREAD (NUM_VERTS / THREADS_PER_LIGHT) -#define FACES_PER_THREAD DivRoundUp(NUM_FACES, THREADS_PER_LIGHT) +#define FACES_PER_THREAD DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT) // All planes and faces are always in the standard order (see below). +// Near and far planes may be swapped for Reverse Z-Buffering, but it does not change the algorithm. #define FACE_LEFT (1 << 0) // x = -1 #define FACE_RIGHT (1 << 1) // x = +1 #define FACE_FRONT (1 << 2) // y = -1 @@ -136,6 +134,8 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) // TODO: the compiler generates 'tbuffer_load_format_x' instructions // when we access the look-up tables. Can we avoid this? +// TODO: try vert order (0 0 0), (1 0 0), (0 1 0), (1 1 0), (0 0 1), (1 0 1), (0 1 1), (1 1 1) + // All vertices are always in the standard order (see below). static const uint s_FaceMasksOfVerts[NUM_VERTS] = { @@ -153,12 +153,12 @@ static const uint s_FaceMasksOfVerts[NUM_VERTS] = // with normals pointing in the interior of the volume. 
static const uint s_VertMasksOfFaces[NUM_FACES] = { - 3 << 9 | 7 << 6 | 4 << 3 | 0 << 0, // 0: FACE_LEFT - 5 << 9 | 6 << 6 | 2 << 3 | 1 << 0, // 1: FACE_RIGHT - 4 << 9 | 5 << 6 | 1 << 3 | 0 << 0, // 2: FACE_FRONT - 2 << 9 | 6 << 6 | 7 << 3 | 3 << 0, // 3: FACE_BACK - 1 << 9 | 2 << 6 | 3 << 3 | 0 << 0, // 4: FACE_TOP - 7 << 9 | 6 << 6 | 5 << 3 | 4 << 0 // 5: FACE_BOTTOM + (3) << 9 | (7) << 6 | (4) << 3 | (0) << 0, // 0: FACE_LEFT + (5) << 9 | (6) << 6 | (2) << 3 | (1) << 0, // 1: FACE_RIGHT + (4) << 9 | (5) << 6 | (1) << 3 | (0) << 0, // 2: FACE_FRONT + (6) << 9 | (7) << 6 | (3) << 3 | (2) << 0, // 3: FACE_BACK + (1) << 9 | (2) << 6 | (3) << 3 | (0) << 0, // 4: FACE_TOP + (7) << 9 | (6) << 6 | (5) << 3 | (4) << 0 // 5: FACE_BOTTOM }; // 5 arrays * 128 elements * 4 bytes each = 2560 bytes. @@ -173,7 +173,7 @@ groupshared uint gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL groupshared uint gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces each (HLSL does not support small data types) // 6 arrays * 16 elements * 4 bytes each = 384 bytes. -// Note that these are actually floats reinterpreted as uints. +// These are actually floats reinterpreted as uints. // The reason is because floating-point atomic operations are not supported. groupshared uint gs_RapAaBbMinPtX[LIGHTS_PER_GROUP]; groupshared uint gs_RapAaBbMaxPtX[LIGHTS_PER_GROUP]; @@ -307,8 +307,8 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_ // Non-zero if ANY of the vertices are behind any of the planes. clipMaskOfFace |= behindMasksOfVerts[v]; - // Note that not all edges may require clipping. However, - // filtering the vertex list is somewhat expensive, so we currently don't do it. + // Not all edges may require clipping. However, filtering the vertex list + // is somewhat expensive, so we currently don't do it. 
vertRingBuffer[j].x = gs_HapVertsX[firstVertexOffset + v]; vertRingBuffer[j].y = gs_HapVertsY[firstVertexOffset + v]; vertRingBuffer[j].z = gs_HapVertsZ[firstVertexOffset + v]; @@ -336,14 +336,14 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_ uint modSrcIdx = srcBegin % MAX_CLIP_VERTS; #endif - for (int j = srcBegin; j < (srcBegin + srcSize); j++) + for (uint j = srcBegin; j < (srcBegin + srcSize); j++) { #ifndef DUMB_COMPILER uint modSrcIdx = j % MAX_CLIP_VERTS; #endif float4 hapVert = vertRingBuffer[modSrcIdx]; - float3 rapVert = hapVert.xyz * rcp(hapVert.w); + float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values rapAaBbMinPt = min(rapAaBbMinPt, rapVert); rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert); @@ -357,7 +357,10 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_ #else // !Z_BINNING -#define MAX_PNTS 9 // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed) +#define THREADS_PER_LIGHT (8) +#define THREADS_PER_GROUP (64) +#define LIGHTS_PER_GROUP (THREADS_PER_GROUP / THREADS_PER_LIGHT) +#define MAX_PNTS (9) // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed) // However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane // clipping gets skipped which doesn't cause any errors. 
@@ -392,8 +395,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) unsigned int g = groupID; unsigned int t = threadID; - const int subLigt = (uint) (t/8); - const int lgtIndex = subLigt+(uint) g*8; + const int subLigt = (uint) (t/THREADS_PER_LIGHT); + const int lgtIndex = subLigt+(uint) g*LIGHTS_PER_GROUP; const int sideIndex = (uint) (t%8); const int eyeAdjustedLgtIndex = GenerateLightCullDataIndex(lgtIndex, g_iNrVisibLights, eyeIndex); @@ -407,13 +410,13 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // Since a light volume may be partially off-screen, we must clip it before computing the AABB. // Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB. // - // To avoid having to deal with toroidal properties of the perspective transform, + // To avoid having to deal with the "Moebius twist" property of the perspective transform, // we perform clipping using the homogeneous (projective) post-perspective coordinates. // This clipping method in described in Blinn's paper titled "Line Clipping". // // The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the // worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4). - // Note that some faces may require culling rather than clipping (the former is simpler). + // Some faces may require culling rather than clipping (the former is simpler). // // It's important to realize that face culling may end up culling 5 (or even all 6) faces. // This means that the clipped light volume may be reduced to a single polygon, or nothing at all. 
@@ -433,20 +436,23 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) const float scale = lgtDat.scaleXY; // scale.x = scale.y const float3 rbpC = lgtDat.center.xyz; + // TODO: store X, Y, Scale const float3 rbpX = lgtDat.boxAxisX.xyz; // Pre-scaled const float3 rbpY = lgtDat.boxAxisY.xyz; // Pre-scaled const float3 rbpZ = lgtDat.boxAxisZ.xyz; // Pre-scaled #ifndef USE_WAVE_INTRINSICS - // Initialize the TGSM. All threads write the same value -> no data races. - // The hardware will coalesce the writes. - gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside - gs_RapAaBbMinPtX[groupLocalLightIndex] = asuint(1.0f); - gs_RapAaBbMaxPtX[groupLocalLightIndex] = asuint(0.0f); - gs_RapAaBbMinPtY[groupLocalLightIndex] = asuint(1.0f); - gs_RapAaBbMaxPtY[groupLocalLightIndex] = asuint(0.0f); - gs_RapAaBbMinPtZ[groupLocalLightIndex] = asuint(1.0f); - gs_RapAaBbMaxPtZ[groupLocalLightIndex] = asuint(0.0f); + // (0) Initialize the TGSM. + if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts + { + gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside + gs_RapAaBbMinPtX[groupLocalLightIndex] = asuint(1.0f); + gs_RapAaBbMaxPtX[groupLocalLightIndex] = asuint(0.0f); + gs_RapAaBbMinPtY[groupLocalLightIndex] = asuint(1.0f); + gs_RapAaBbMaxPtY[groupLocalLightIndex] = asuint(0.0f); + gs_RapAaBbMinPtZ[groupLocalLightIndex] = asuint(1.0f); + gs_RapAaBbMaxPtZ[groupLocalLightIndex] = asuint(0.0f); + } #endif // USE_WAVE_INTRINSICS float3 rapAaBbMinPt = 1; @@ -482,20 +488,16 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ; // Avoid generating (w = 0). - rbpVert.z = (abs(rbpVert.z) >= FLT_EPS) ? rbpVert.z : FLT_EPS; + rbpVert.z = (abs(rbpVert.z) > FLT_MIN) ? rbpVert.z : FLT_MIN; float4 hapVert = mul(g_mProjection, float4(rbpVert, 1)); - // Make sure the W component is strictly positive. 
- // It is helpful in order to simplify clipping and to avoid perspective division by 0. - float w = hapVert.w; - float s = (w >= 0) ? 1 : -1; + // Warning: the W component may be negative. + // Flipping the -W pyramid by negating all coordinates is incorrect + // and will break both classification and clipping. // Transform the X and Y components: [-w, w] -> [0, w]. - hapVert.x = (0.5 * s) * hapVert.x + ((0.5 * s) * w); - hapVert.y = (0.5 * s) * hapVert.y + ((0.5 * s) * w); - hapVert.z = s * hapVert.z; - hapVert.w = s * hapVert.w; + hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w); // For each vertex, we must determine whether it is within the bounds. // For culling and clipping, we must know, per culling plane, whether the vertex @@ -509,17 +511,18 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // w is always valid // For the orthographic projection, (w = 1), so no modifications are necessary. // TODO: epsilon for numerical robustness? - w = hapVert.w; for (uint j = 0; j < (NUM_PLANES / 2); j++) { + float w = hapVert.w; + behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0' behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w' } if (behindMask == 0) // Inside? 
{ - float3 rapVert = hapVert.xyz * rcp(hapVert.w); + float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values rapAaBbMinPt = min(rapAaBbMinPt, rapVert); rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert); @@ -573,7 +576,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e)); float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f); - lightSpaceMatrix = mul(perspProjMatrix, mul(invTranslateEye, lightSpaceMatrix)); + lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix); } for (uint i = 0; i < VERTS_PER_THREAD; i++) @@ -601,7 +604,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // Consider the vertex to be inside the light volume if: // -w < x < w // -w < y < w <-- exclude boundary points, as we will not clip using these vertices - // -w < z < w + // -w < z < w <-- assume that Z-precision is not very important here // 0 < w // For the orthographic projection, (w = 1), so no modifications are necessary. // TODO: epsilon for numerical robustness? @@ -627,20 +630,22 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) } // (3) Cull the faces. 
- const uint cullFaceMask = cullClipFaceMask; - const uint numFacesToCull = countbits(cullFaceMask); // [0, 6] - - for (uint i = 0; i < FACES_PER_THREAD; i++) { - uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; + const uint cullFaceMask = cullClipFaceMask; + const uint numFacesToCull = countbits(cullFaceMask); // [0, 6] - if (n < numFacesToCull) + for (uint i = 0; i < FACES_PER_THREAD; i++) { - uint f = NthBitLow(cullFaceMask, n); + uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; - if (TryCullFace(f, behindMasksOfVerts)) + if (n < numFacesToCull) { - cullClipFaceMask ^= 1 << f; // Clear the bit + uint f = NthBitLow(cullFaceMask, n); + + if (TryCullFace(f, behindMasksOfVerts)) + { + cullClipFaceMask ^= 1 << f; // Clear the bit + } } } } @@ -656,19 +661,21 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) #endif // (4) Clip the faces. - const uint clipFaceMask = cullClipFaceMask; - const uint numFacesToClip = countbits(clipFaceMask); // [0, 6] - - for (uint i = 0; i < FACES_PER_THREAD; i++) { - uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; + const uint clipFaceMask = cullClipFaceMask; + const uint numFacesToClip = countbits(clipFaceMask); // [0, 6] - if (n < numFacesToCull) + for (uint i = 0; i < FACES_PER_THREAD; i++) { - uint f = NthBitLow(clipFaceMask, n); + uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; + + if (n < numFacesToClip) + { + uint f = NthBitLow(clipFaceMask, n); - ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset, - rapAaBbMinPt, rapAaBbMaxPt); + ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset, + rapAaBbMinPt, rapAaBbMaxPt); + } } } @@ -677,12 +684,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) #else // Integer comparison works for floating-point numbers as long as the sign bit is 0. // We must take care of the signed zero ourselves. 
- InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(rapAaBbMinPt.x) & INT_MAX); - InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(rapAaBbMaxPt.x) & INT_MAX); - InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(rapAaBbMinPt.y) & INT_MAX); - InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(rapAaBbMaxPt.y) & INT_MAX); - InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(rapAaBbMinPt.z) & INT_MAX); - InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(rapAaBbMaxPt.z) & INT_MAX); + InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.x))); + InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.x))); + InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.y))); + InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.y))); + InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.z))); + InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.z))); GroupMemoryBarrierWithGroupSync(); @@ -694,7 +701,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) rapAaBbMaxPt.z = asfloat(gs_RapAaBbMaxPtZ[groupLocalLightIndex]); #endif // USE_WAVE_INTRINSICS - if (t % THREADS_PER_LIGHT == 0) + if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts { // Each light's AABB is represented by two float3s, the min and max of the box. // And for stereo, we have two sets of lights. 
Therefore, each eye has a set of mins, followed by From 8a2458a5f5d2f362fce4090f8e9466cfa86712d3 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Thu, 6 Aug 2020 21:38:10 -0700 Subject: [PATCH 09/22] Optimize --- .../Lighting/LightLoop/scrbound.compute | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 665296d158b..bee9499abdd 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -10,7 +10,7 @@ #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl" -#pragma enable_d3d11_debug_symbols +// #pragma enable_d3d11_debug_symbols #pragma only_renderers d3d11 playstation xboxone vulkan metal switch uniform int g_isOrthographic; @@ -27,7 +27,7 @@ StructuredBuffer g_data : register( t0 ); RWStructuredBuffer g_vBoundsBuffer : register( u0 ); #define Z_BINNING -// #define DUMB_COMPILER +#define DUMB_COMPILER // #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported #ifdef Z_BINNING @@ -80,7 +80,6 @@ float3x3 Invert3x3(float3x3 R) float3x3 adj = float3x3(cross(C[1], C[2]), cross(C[2], C[0]), cross(C[0], C[1])); - return rcp(det) * adj; } @@ -90,14 +89,13 @@ float4x4 Homogenize3x3(float3x3 R) float4(R[1], 0), float4(R[2], 0), float4(0,0,0,1)); - return M; } float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) { - float b = (f + n) * rcp(f - n); - float c = -2 * f * n * rcp(f - n); + float b = (f + n) * rcp(f - n); // z: [-1, 1] + float c = -2 * f * n * rcp(f - n); // No Z-reversal return float4x4(g/a, 0, 0, 
0, 0, g, 0, 0, @@ -115,14 +113,14 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) #define NUM_FACES (6) #define NUM_PLANES (6) #define THREADS_PER_GROUP (64) -#define THREADS_PER_LIGHT (1) // Set to 1 for debugging +#define THREADS_PER_LIGHT (4) // Set to 1 for debugging #define LIGHTS_PER_GROUP (THREADS_PER_GROUP / THREADS_PER_LIGHT) #define VERTS_PER_GROUP (NUM_VERTS * LIGHTS_PER_GROUP) #define VERTS_PER_THREAD (NUM_VERTS / THREADS_PER_LIGHT) #define FACES_PER_THREAD DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT) // All planes and faces are always in the standard order (see below). -// Near and far planes may be swapped for Reverse Z-Buffering, but it does not change the algorithm. +// Near and far planes are swapped in the case of Z-reversal, but it does not affect the algorithm. #define FACE_LEFT (1 << 0) // x = -1 #define FACE_RIGHT (1 << 1) // x = +1 #define FACE_FRONT (1 << 2) // y = -1 @@ -481,7 +479,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float3 m; // See the comment above m.x = (countbits(v % 4) == 1) ? 1 : -1; - m.y = (v & 2 != 0) ? 1 : -1; + m.y = ((v & 2) != 0) ? 1 : -1; m.z = (v >= 4) ? 1 : -1; m.xy *= (v >= 4) ? 1 : scale; @@ -506,7 +504,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // Consider the vertex to be inside the view volume if: // 0 <= x <= w - // 0 <= y <= w <-- include boundary points, to avoid clipping them later + // 0 <= y <= w <-- include boundary points to avoid clipping them later // 0 <= z <= w // w is always valid // For the orthographic projection, (w = 1), so no modifications are necessary. @@ -561,7 +559,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(Rotation3x3(rbpX, rbpY, rbpZ))); // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly. 
- // This (orhographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube. + // This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube. float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace); if (scale != 1) // Perspective light space? @@ -595,7 +593,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float3 rapVertCS; // See the comment above rapVertCS.x = (countbits(v % 4) == 1) ? 1 : -1; - rapVertCS.y = (v & 2 != 0) ? 1 : -1; + rapVertCS.y = ((v & 2) != 0) ? 1 : -1; rapVertCS.z = (v >= 4) ? 1 : 0; float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space @@ -708,10 +706,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // a set of maxs, and each set is equal to g_iNrVisibLights. const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex); - float minLinearDepth = 0, maxLinearDepth = FLT_MAX; // TODO - - g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, minLinearDepth); - g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, maxLinearDepth); + g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, 0); // TODO: add me - lin depth + g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, 100000); // } #else // !Z_BINNING From 0453c6c5d0a15db7fe5efe5a48bf8f8282f0da6a Mon Sep 17 00:00:00 2001 From: Evgenii Date: Fri, 7 Aug 2020 13:15:40 -0700 Subject: [PATCH 10/22] Also store view space Z --- .../Lighting/LightLoop/scrbound.compute | 112 ++++++++++-------- 1 file changed, 65 insertions(+), 47 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index bee9499abdd..abb17765989 100644 --- 
a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -170,15 +170,17 @@ groupshared uint gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL // 1 array * 16 elements * 4 bytes each = 64 bytes. groupshared uint gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces each (HLSL does not support small data types) -// 6 arrays * 16 elements * 4 bytes each = 384 bytes. +// 8 arrays * 16 elements * 4 bytes each = 512 bytes. // These are actually floats reinterpreted as uints. // The reason is because floating-point atomic operations are not supported. -groupshared uint gs_RapAaBbMinPtX[LIGHTS_PER_GROUP]; -groupshared uint gs_RapAaBbMaxPtX[LIGHTS_PER_GROUP]; -groupshared uint gs_RapAaBbMinPtY[LIGHTS_PER_GROUP]; -groupshared uint gs_RapAaBbMaxPtY[LIGHTS_PER_GROUP]; -groupshared uint gs_RapAaBbMinPtZ[LIGHTS_PER_GROUP]; -groupshared uint gs_RapAaBbMaxPtZ[LIGHTS_PER_GROUP]; +groupshared uint gs_NdcAaBbMinPtX[LIGHTS_PER_GROUP]; +groupshared uint gs_NdcAaBbMaxPtX[LIGHTS_PER_GROUP]; +groupshared uint gs_NdcAaBbMinPtY[LIGHTS_PER_GROUP]; +groupshared uint gs_NdcAaBbMaxPtY[LIGHTS_PER_GROUP]; +groupshared uint gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z cannot be trivially reconstructed +groupshared uint gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique. +groupshared uint gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate +groupshared uint gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate #endif // USE_WAVE_INTRINSICS // Returns 'true' if it manages to cull the face. 
@@ -290,8 +292,8 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize, } } -void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset, - inout float3 rapAaBbMinPt, inout float3 rapAaBbMaxPt) +void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset, float4x4 g_mInvProjection, + inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt) { float4 vertRingBuffer[MAX_CLIP_VERTS]; uint srcBegin = 0, srcSize = 4; @@ -340,11 +342,15 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_ uint modSrcIdx = j % MAX_CLIP_VERTS; #endif - float4 hapVert = vertRingBuffer[modSrcIdx]; - float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values + float4 hapVert = vertRingBuffer[modSrcIdx]; + float4 hbpVertVS = mul(g_mInvProjection, hapVert); // Just to support orthographic projection + float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values + float rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w); - rapAaBbMinPt = min(rapAaBbMinPt, rapVert); - rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert); + ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC); + ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC); + ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, rbpVertVSz); + ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, rbpVertVSz); #ifdef DUMB_COMPILER modSrcIdx++; @@ -444,17 +450,19 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts { gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside - gs_RapAaBbMinPtX[groupLocalLightIndex] = asuint(1.0f); - gs_RapAaBbMaxPtX[groupLocalLightIndex] = asuint(0.0f); - gs_RapAaBbMinPtY[groupLocalLightIndex] = asuint(1.0f); - gs_RapAaBbMaxPtY[groupLocalLightIndex] = asuint(0.0f); - gs_RapAaBbMinPtZ[groupLocalLightIndex] = asuint(1.0f); - 
gs_RapAaBbMaxPtZ[groupLocalLightIndex] = asuint(0.0f); + gs_NdcAaBbMinPtX[groupLocalLightIndex] = asuint(1.0f); + gs_NdcAaBbMaxPtX[groupLocalLightIndex] = asuint(0.0f); + gs_NdcAaBbMinPtY[groupLocalLightIndex] = asuint(1.0f); + gs_NdcAaBbMaxPtY[groupLocalLightIndex] = asuint(0.0f); + gs_NdcAaBbMinPtZ[groupLocalLightIndex] = asuint(1.0f); + gs_NdcAaBbMaxPtZ[groupLocalLightIndex] = asuint(0.0f); + gs_NdcAaBbMinPtW[groupLocalLightIndex] = asuint(FLT_INF); + gs_NdcAaBbMaxPtW[groupLocalLightIndex] = asuint(0.0f); } #endif // USE_WAVE_INTRINSICS - float3 rapAaBbMinPt = 1; - float3 rapAaBbMaxPt = 0; + float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF); + float4 ndcAaBbMaxPt = 0; // We must determine whether we have to clip or cull any of the faces. // If all vertices of a face are inside with respect to all the culling planes, @@ -484,11 +492,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) m.xy *= (v >= 4) ? 1 : scale; - float3 rbpVert = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ; + float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ; // Avoid generating (w = 0). - rbpVert.z = (abs(rbpVert.z) > FLT_MIN) ? rbpVert.z : FLT_MIN; + rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN; - float4 hapVert = mul(g_mProjection, float4(rbpVert, 1)); + float4 hapVert = mul(g_mProjection, float4(rbpVertVS, 1)); // Warning: the W component may be negative. // Flipping the -W pyramid by negating all coordinates is incorrect @@ -497,6 +505,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // Transform the X and Y components: [-w, w] -> [0, w]. hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w); + // TODO: multiply vertex by ViewZ if orthographic for unified processing! + // For each vertex, we must determine whether it is within the bounds. // For culling and clipping, we must know, per culling plane, whether the vertex // is in the positive or the negative half-space. 
@@ -520,10 +530,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) if (behindMask == 0) // Inside? { - float3 rapVert = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values + float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values - rapAaBbMinPt = min(rapAaBbMinPt, rapVert); - rapAaBbMaxPt = max(rapAaBbMaxPt, rapVert); + ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC); + ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC); + ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, rbpVertVS.z); + ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, rbpVertVS.z); } else // Outside { @@ -612,10 +624,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) if (inside) { float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z); + float rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w); - // Update the AABB. - rapAaBbMinPt = min(rapAaBbMinPt, rapVertNDC); - rapAaBbMaxPt = max(rapAaBbMaxPt, rapVertNDC); + ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC); + ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC); + ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, rbpVertVSz); + ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, rbpVertVSz); } } } @@ -671,8 +685,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint f = NthBitLow(clipFaceMask, n); - ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset, - rapAaBbMinPt, rapAaBbMaxPt); + ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset, g_mInvProjection, + ndcAaBbMinPt, ndcAaBbMaxPt); } } } @@ -681,22 +695,26 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // ... #else // Integer comparison works for floating-point numbers as long as the sign bit is 0. - // We must take care of the signed zero ourselves. 
- InterlockedMin(gs_RapAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.x))); - InterlockedMax(gs_RapAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.x))); - InterlockedMin(gs_RapAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.y))); - InterlockedMax(gs_RapAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.y))); - InterlockedMin(gs_RapAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMinPt.z))); - InterlockedMax(gs_RapAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(rapAaBbMaxPt.z))); + // We must take care of the signed zero ourselves. saturate() does not help here. + InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x))); + InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x))); + InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y))); + InterlockedMax(gs_NdcAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y))); + InterlockedMin(gs_NdcAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z))); + InterlockedMax(gs_NdcAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z))); + InterlockedMin(gs_NdcAaBbMinPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w))); + InterlockedMax(gs_NdcAaBbMaxPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w))); GroupMemoryBarrierWithGroupSync(); - rapAaBbMinPt.x = asfloat(gs_RapAaBbMinPtX[groupLocalLightIndex]); - rapAaBbMaxPt.x = asfloat(gs_RapAaBbMaxPtX[groupLocalLightIndex]); - rapAaBbMinPt.y = asfloat(gs_RapAaBbMinPtY[groupLocalLightIndex]); - rapAaBbMaxPt.y = asfloat(gs_RapAaBbMaxPtY[groupLocalLightIndex]); - rapAaBbMinPt.z = asfloat(gs_RapAaBbMinPtZ[groupLocalLightIndex]); - rapAaBbMaxPt.z = asfloat(gs_RapAaBbMaxPtZ[groupLocalLightIndex]); + ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[groupLocalLightIndex]); + ndcAaBbMaxPt.x = 
asfloat(gs_NdcAaBbMaxPtX[groupLocalLightIndex]); + ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[groupLocalLightIndex]); + ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[groupLocalLightIndex]); + ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[groupLocalLightIndex]); + ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[groupLocalLightIndex]); + ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[groupLocalLightIndex]); + ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[groupLocalLightIndex]); #endif // USE_WAVE_INTRINSICS if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts @@ -706,8 +724,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // a set of maxs, and each set is equal to g_iNrVisibLights. const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex); - g_vBoundsBuffer[boundsIndices.min] = float4(rapAaBbMinPt, 0); // TODO: add me - lin depth - g_vBoundsBuffer[boundsIndices.max] = float4(rapAaBbMaxPt, 100000); // + g_vBoundsBuffer[boundsIndices.min] = ndcAaBbMinPt; + g_vBoundsBuffer[boundsIndices.max] = ndcAaBbMaxPt; } #else // !Z_BINNING From 7aa331ce25d7f7c2362ecf4fd162638d6588ed7a Mon Sep 17 00:00:00 2001 From: Evgenii Date: Sat, 8 Aug 2020 14:05:14 -0700 Subject: [PATCH 11/22] Optimize orthographic --- .../Lighting/LightLoop/scrbound.compute | 65 ++++++++++--------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index abb17765989..a2cbbbd9e74 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -120,7 +120,7 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) #define FACES_PER_THREAD DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT) // All planes and faces are 
always in the standard order (see below). -// Near and far planes are swapped in the case of Z-reversal, but it does not affect the algorithm. +// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm. #define FACE_LEFT (1 << 0) // x = -1 #define FACE_RIGHT (1 << 1) // x = +1 #define FACE_FRONT (1 << 2) // y = -1 @@ -254,7 +254,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize, // 1. v0 in, v1 out -> add intersection // 2. v0 out, v1 in -> add intersection, add v1 // 3. v0 in, v1 in -> add v1 - // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of the signed zero. + // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of -0. if ((tailVert.bc >= 0) != (leadVert.bc >= 0)) { @@ -292,11 +292,12 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize, } } -void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset, float4x4 g_mInvProjection, - inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt) +void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset, + out uint srcBegin, out uint srcSize, + out float4 vertRingBuffer[MAX_CLIP_VERTS]) { - float4 vertRingBuffer[MAX_CLIP_VERTS]; - uint srcBegin = 0, srcSize = 4; + srcBegin = 0; + srcSize = 4; uint clipMaskOfFace = 0; // Initially in front uint vertMaskOfFace = s_VertMasksOfFaces[f]; @@ -331,7 +332,12 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_ clipMaskOfFace ^= 1 << p; // Clear the bit to continue using firstbitlow() } +} +void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERTS], + bool isOrthoProj, float4x4 invProj, + inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt) +{ #ifdef DUMB_COMPILER uint modSrcIdx = srcBegin % MAX_CLIP_VERTS; #endif @@ -341,17 +347,18 @@ void ClipFaceAgainstViewVolumeAndUpdateAaBb(uint f, uint behindMasksOfVerts[NUM_ #ifndef DUMB_COMPILER uint modSrcIdx = j % 
MAX_CLIP_VERTS; #endif - float4 hapVert = vertRingBuffer[modSrcIdx]; - float4 hbpVertVS = mul(g_mInvProjection, hapVert); // Just to support orthographic projection - float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values - float rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w); + // Clamp to the bounds in case of numerical errors (may still generate -0). + float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); + float rbpVertVSz = hapVert.w; - ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC); - ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC); - ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, rbpVertVSz); - ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, rbpVertVSz); + if (isOrthoProj) // Must replace (w = 1) + { + rbpVertVSz = dot(invProj[2], hapVert); + } + ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz)); + ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz)); #ifdef DUMB_COMPILER modSrcIdx++; modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx; @@ -501,12 +508,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // Warning: the W component may be negative. // Flipping the -W pyramid by negating all coordinates is incorrect // and will break both classification and clipping. + // For the orthographic projection, (w = 1). // Transform the X and Y components: [-w, w] -> [0, w]. hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w); - // TODO: multiply vertex by ViewZ if orthographic for unified processing! - // For each vertex, we must determine whether it is within the bounds. // For culling and clipping, we must know, per culling plane, whether the vertex // is in the positive or the negative half-space. 
@@ -517,7 +523,6 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // 0 <= y <= w <-- include boundary points to avoid clipping them later // 0 <= z <= w // w is always valid - // For the orthographic projection, (w = 1), so no modifications are necessary. // TODO: epsilon for numerical robustness? for (uint j = 0; j < (NUM_PLANES / 2); j++) @@ -530,12 +535,11 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) if (behindMask == 0) // Inside? { - float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); // Must not generate negative values + // Clamp to the bounds in case of numerical errors (may still generate -0). + float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); - ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC); - ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC); - ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, rbpVertVS.z); - ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, rbpVertVS.z); + ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVS.z)); + ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVS.z)); } else // Outside { @@ -616,7 +620,6 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // -w < y < w <-- exclude boundary points, as we will not clip using these vertices // -w < z < w <-- assume that Z-precision is not very important here // 0 < w - // For the orthographic projection, (w = 1), so no modifications are necessary. // TODO: epsilon for numerical robustness? 
bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w; @@ -626,10 +629,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z); float rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w); - ndcAaBbMinPt.xyz = min(ndcAaBbMinPt.xyz, rapVertNDC); - ndcAaBbMaxPt.xyz = max(ndcAaBbMaxPt.xyz, rapVertNDC); - ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, rbpVertVSz); - ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, rbpVertVSz); + ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz)); + ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz)); } } } @@ -685,8 +686,12 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint f = NthBitLow(clipFaceMask, n); - ClipFaceAgainstViewVolumeAndUpdateAaBb(f, behindMasksOfVerts, firstVertexOffset, g_mInvProjection, - ndcAaBbMinPt, ndcAaBbMaxPt); + uint srcBegin, srcSize; + float4 vertRingBuffer[MAX_CLIP_VERTS]; + ClipFaceAgainstViewVolume(f, behindMasksOfVerts, firstVertexOffset, + srcBegin, srcSize, vertRingBuffer); + UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, g_mInvProjection, + ndcAaBbMinPt, ndcAaBbMaxPt); } } } @@ -695,7 +700,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // ... #else // Integer comparison works for floating-point numbers as long as the sign bit is 0. - // We must take care of the signed zero ourselves. saturate() does not help here. + // We must take care of -0 ourselves. saturate() does not help here. 
InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x))); InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x))); InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y))); From 05a222e96f602f49088e141a647e8e8255e04892 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Mon, 10 Aug 2020 13:10:23 -0700 Subject: [PATCH 12/22] Optimize LUT --- .../Lighting/LightLoop/scrbound.compute | 144 +++++++++--------- 1 file changed, 71 insertions(+), 73 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index a2cbbbd9e74..32c403ce85d 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -65,7 +65,8 @@ float4x4 Translation4x4(float3 d) return M; } -float3x3 Rotation3x3(float3 xAxis, float3 yAxis, float3 zAxis) +// Scale followed by rotation (scaled axes). +float3x3 ScaledRotation3x3(float3 xAxis, float3 yAxis, float3 zAxis) { float3x3 R = float3x3(xAxis, yAxis, zAxis); float3x3 C = transpose(R); // Row to column @@ -94,7 +95,7 @@ float4x4 Homogenize3x3(float3x3 R) float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) { - float b = (f + n) * rcp(f - n); // z: [-1, 1] + float b = (f + n) * rcp(f - n); // Z in [-1, 1] float c = -2 * f * n * rcp(f - n); // No Z-reversal return float4x4(g/a, 0, 0, 0, @@ -103,7 +104,7 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) 0, 0, 1, 0); } -#define CLEAR_SIGN_BIT(X) (asuint(X) & INT_MAX) +#define CLEAR_SIGN_BIT(X) (asint(X) & INT_MAX) #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks // Clipping a plane by a cube may produce a hexagon (6-gon). 
@@ -121,43 +122,60 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) // All planes and faces are always in the standard order (see below). // Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm. -#define FACE_LEFT (1 << 0) // x = -1 -#define FACE_RIGHT (1 << 1) // x = +1 -#define FACE_FRONT (1 << 2) // y = -1 -#define FACE_BACK (1 << 3) // y = +1 -#define FACE_TOP (1 << 4) // z = -1 -#define FACE_BOTTOM (1 << 5) // z = +1 +#define FACE_LEFT (1 << 0) // -X z +#define FACE_RIGHT (1 << 1) // +X / +#define FACE_TOP (1 << 2) // -Y 0 -- x +#define FACE_BOTTOM (1 << 3) // +Y | +#define FACE_FRONT (1 << 4) // -Z y +#define FACE_BACK (1 << 5) // +Z #define FACE_MASK ((1 << NUM_FACES) - 1) -// TODO: the compiler generates 'tbuffer_load_format_x' instructions -// when we access the look-up tables. Can we avoid this? - -// TODO: try vert order (0 0 0), (1 0 0), (0 1 0), (1 1 0), (0 0 1), (1 0 1), (0 1 1), (1 1 1) +// A list of vertices for each face (CCW order w.r.t. its normal, starting from the LSB). +#define VERT_LIST_LEFT ((2) << 9 | (6) << 6 | (4) << 3 | (0) << 0) +#define VERT_LIST_RIGHT ((5) << 9 | (7) << 6 | (3) << 3 | (1) << 0) +#define VERT_LIST_TOP ((1) << 9 | (3) << 6 | (2) << 3 | (0) << 0) +#define VERT_LIST_BOTTOM ((6) << 9 | (7) << 6 | (5) << 3 | (4) << 0) +#define VERT_LIST_FRONT ((4) << 9 | (5) << 6 | (1) << 3 | (0) << 0) +#define VERT_LIST_BACK ((3) << 9 | (7) << 6 | (6) << 3 | (2) << 0) // All vertices are always in the standard order (see below). 
-static const uint s_FaceMasksOfVerts[NUM_VERTS] = +uint GetFaceMaskOfVertex(uint v) { - FACE_LEFT | FACE_FRONT | FACE_TOP, // 0: (-1, -1, -1) - FACE_RIGHT | FACE_FRONT | FACE_TOP, // 1: (+1, -1, -1) - FACE_RIGHT | FACE_BACK | FACE_TOP, // 2: (+1, +1, -1) - FACE_LEFT | FACE_BACK | FACE_TOP, // 3: (-1, +1, -1) - FACE_LEFT | FACE_FRONT | FACE_BOTTOM, // 4: (-1, -1, +1) - FACE_RIGHT | FACE_FRONT | FACE_BOTTOM, // 5: (+1, -1, +1) - FACE_RIGHT | FACE_BACK | FACE_BOTTOM, // 6: (+1, +1, +1) - FACE_LEFT | FACE_BACK | FACE_BOTTOM // 7: (-1, +1, +1) + // 0: (-1, -1, -1) -> { FACE_LEFT | FACE_TOP | FACE_FRONT } + // 1: (+1, -1, -1) -> { FACE_RIGHT | FACE_TOP | FACE_FRONT } + // 2: (-1, +1, -1) -> { FACE_LEFT | FACE_BOTTOM | FACE_FRONT } + // 3: (+1, +1, -1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_FRONT } + // 4: (-1, -1, +1) -> { FACE_LEFT | FACE_TOP | FACE_BACK } + // 5: (+1, -1, +1) -> { FACE_RIGHT | FACE_TOP | FACE_BACK } + // 6: (-1, +1, +1) -> { FACE_LEFT | FACE_BOTTOM | FACE_BACK } + // 7: (+1, +1, +1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_BACK } + // ((v & 1) == 0) ? 1 : 2) | ((v & 2) == 0) ? 4 : 8) | ((v & 4) == 0) ? 16 : 32) + uint f = (FACE_LEFT << BitFieldExtract(v, 0, 1)) + | (FACE_TOP << BitFieldExtract(v, 1, 1)) + | (FACE_FRONT << BitFieldExtract(v, 2, 1)); + + return f; }; -// CCW order (starting with the LSB) of vertices for each face (w.r.t. its normal), -// with normals pointing in the interior of the volume. -static const uint s_VertMasksOfFaces[NUM_FACES] = +float3 GenerateVertexOfStandardCube(uint v) { - (3) << 9 | (7) << 6 | (4) << 3 | (0) << 0, // 0: FACE_LEFT - (5) << 9 | (6) << 6 | (2) << 3 | (1) << 0, // 1: FACE_RIGHT - (4) << 9 | (5) << 6 | (1) << 3 | (0) << 0, // 2: FACE_FRONT - (6) << 9 | (7) << 6 | (3) << 3 | (2) << 0, // 3: FACE_BACK - (1) << 9 | (2) << 6 | (3) << 3 | (0) << 0, // 4: FACE_TOP - (7) << 9 | (6) << 6 | (5) << 3 | (4) << 0 // 5: FACE_BOTTOM -}; + float3 p; + + p.x = ((v & 1) == 0) ? -1 : 1; + p.y = ((v & 2) == 0) ? 
-1 : 1; + p.z = ((v & 4) == 0) ? -1 : 1; + + return p; +} + +uint GetVertexListOfFace(uint f) +{ + static const uint3 allVertLists = uint3((VERT_LIST_RIGHT << 12) | VERT_LIST_LEFT, + (VERT_LIST_BOTTOM << 12) | VERT_LIST_TOP, + (VERT_LIST_BACK << 12) | VERT_LIST_FRONT); + + return BitFieldExtract(allVertLists[f >> 1], 12 * (f & 1), 12); +} // 5 arrays * 128 elements * 4 bytes each = 2560 bytes. groupshared float gs_HapVertsX[VERTS_PER_GROUP]; @@ -187,11 +205,11 @@ groupshared uint gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS]) { uint cullMaskOfFace = FACE_MASK; // Initially behind - uint vertMaskOfFace = s_VertMasksOfFaces[f]; + uint vertListOfFace = GetVertexListOfFace(f); for (int j = 0; j < 4; j++) { - uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3); + uint v = BitFieldExtract(vertListOfFace, 3 * j, 3); // Non-zero if ALL the vertices are behind any of the planes. cullMaskOfFace &= behindMasksOfVerts[v]; } @@ -207,9 +225,9 @@ struct ClipVertex ClipVertex CreateClipVertex(uint p, float4 v) { - bool evenPlane = (p % 2) == 0; + bool evenPlane = (p & 1) == 0; - float c = v[p / 2]; + float c = v[p >> 1]; float w = v.w; ClipVertex cv; @@ -300,7 +318,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint srcSize = 4; uint clipMaskOfFace = 0; // Initially in front - uint vertMaskOfFace = s_VertMasksOfFaces[f]; + uint vertListOfFace = GetVertexListOfFace(f); for (int j = 0; j < 4; j++) { @@ -316,11 +334,9 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint vertRingBuffer[j].w = gs_HapVertsW[firstVertexOffset + v]; } - const uint numPlanesToClipAgainst = countbits(clipMaskOfFace); // [1, 6] - // Sutherland-Hodgeman polygon clipping algorithm. // It works by clipping the entire polygon against one clipping plane at a time. 
- for (uint j = 0; j < numPlanesToClipAgainst; j++) + while (clipMaskOfFace != 0) { uint p = firstbitlow(clipMaskOfFace); @@ -341,13 +357,12 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT #ifdef DUMB_COMPILER uint modSrcIdx = srcBegin % MAX_CLIP_VERTS; #endif - for (uint j = srcBegin; j < (srcBegin + srcSize); j++) { #ifndef DUMB_COMPILER uint modSrcIdx = j % MAX_CLIP_VERTS; #endif - float4 hapVert = vertRingBuffer[modSrcIdx]; + float4 hapVert = vertRingBuffer[modSrcIdx]; // Clamp to the bounds in case of numerical errors (may still generate -0). float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w)); float rbpVertVSz = hapVert.w; @@ -482,22 +497,17 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; - // rbpVerts[0] = rbpC - rbpX * scale.x - rbpY * scale.y - rbpZ; // (-1, -1, -1) - // rbpVerts[1] = rbpC + rbpX * scale.x - rbpY * scale.y - rbpZ; // (+1, -1, -1) - // rbpVerts[2] = rbpC + rbpX * scale.x + rbpY * scale.y - rbpZ; // (+1, +1, -1) - // rbpVerts[3] = rbpC - rbpX * scale.x + rbpY * scale.y - rbpZ; // (-1, +1, -1) - // rbpVerts[4] = rbpC - rbpX - rbpY + rbpZ; // (-1, -1, +1) - // rbpVerts[5] = rbpC + rbpX - rbpY + rbpZ; // (+1, -1, +1) - // rbpVerts[6] = rbpC + rbpX + rbpY + rbpZ; // (+1, +1, +1) - // rbpVerts[7] = rbpC - rbpX + rbpY + rbpZ; // (-1, +1, +1) + // rbpVerts[0] = rbpC - rbpX * scale - rbpY * scale - rbpZ; (-s, -s, -1) + // rbpVerts[1] = rbpC + rbpX * scale - rbpY * scale - rbpZ; (+s, -s, -1) + // rbpVerts[2] = rbpC - rbpX * scale + rbpY * scale - rbpZ; (-s, +s, -1) + // rbpVerts[3] = rbpC + rbpX * scale + rbpY * scale - rbpZ; (+s, +s, -1) + // rbpVerts[4] = rbpC - rbpX - rbpY + rbpZ; (-1, -1, +1) + // rbpVerts[5] = rbpC + rbpX - rbpY + rbpZ; (+1, -1, +1) + // rbpVerts[6] = rbpC - rbpX + rbpY + rbpZ; (-1, +1, +1) + // rbpVerts[7] = rbpC + rbpX + rbpY + rbpZ; (+1, +1, +1) - float3 m; // See the comment above - - m.x = 
(countbits(v % 4) == 1) ? 1 : -1; - m.y = ((v & 2) != 0) ? 1 : -1; - m.z = (v >= 4) ? 1 : -1; - - m.xy *= (v >= 4) ? 1 : scale; + float3 m = GenerateVertexOfStandardCube(v); + m.xy *= ((v & 4) == 0) ? scale : 1; // X, Y in [-scale, scale] float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ; // Avoid generating (w = 0). @@ -543,7 +553,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) } else // Outside { - cullClipFaceMask |= s_FaceMasksOfVerts[v]; + cullClipFaceMask |= GetFaceMaskOfVertex(v); } gs_HapVertsX[firstVertexOffset + v] = hapVert.x; @@ -572,7 +582,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // The light volume is a special type of cuboid - a right frustum. // We can exploit this fact by building a light-space projection matrix. float4x4 invTranslateToLightSpace = Translation4x4(-rbpC); - float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(Rotation3x3(rbpX, rbpY, rbpZ))); + float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(ScaledRotation3x3(rbpX, rbpY, rbpZ))); // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly. // This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube. @@ -597,20 +607,8 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; - // rapVertsCS[0] = (-1, -1, 0) - // rapVertsCS[1] = (+1, -1, 0) - // rapVertsCS[2] = (+1, +1, 0) - // rapVertsCS[3] = (-1, +1, 0) - // rapVertsCS[4] = (-1, -1, 1) - // rapVertsCS[5] = (+1, -1, 1) - // rapVertsCS[6] = (+1, +1, 1) - // rapVertsCS[7] = (-1, +1, 1) - - float3 rapVertCS; // See the comment above - - rapVertCS.x = (countbits(v % 4) == 1) ? 1 : -1; - rapVertCS.y = ((v & 2) != 0) ? 1 : -1; - rapVertCS.z = (v >= 4) ? 
1 : 0; + float3 rapVertCS = GenerateVertexOfStandardCube(v); + rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1] float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS); // View to light space From 1e90134ad9e6104b80d38ccdf8b830ef2ffba440 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 12:54:05 -0700 Subject: [PATCH 13/22] Add wave intrinsic support --- .../Lighting/LightLoop/scrbound.compute | 66 ++++++++++++++----- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 32c403ce85d..efa35aed440 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -28,7 +28,6 @@ RWStructuredBuffer g_vBoundsBuffer : register( u0 ); #define Z_BINNING #define DUMB_COMPILER -// #define USE_WAVE_INTRINSICS // We use TGSM and atomic operations if wave intrinsics are not supported #ifdef Z_BINNING @@ -170,9 +169,10 @@ float3 GenerateVertexOfStandardCube(uint v) uint GetVertexListOfFace(uint f) { - static const uint3 allVertLists = uint3((VERT_LIST_RIGHT << 12) | VERT_LIST_LEFT, - (VERT_LIST_BOTTOM << 12) | VERT_LIST_TOP, - (VERT_LIST_BACK << 12) | VERT_LIST_FRONT); + // Warning: don't add 'static' here unless you want really bad code gen. 
+ const uint3 allVertLists = uint3((VERT_LIST_RIGHT << 12) | VERT_LIST_LEFT, + (VERT_LIST_BOTTOM << 12) | VERT_LIST_TOP, + (VERT_LIST_BACK << 12) | VERT_LIST_FRONT); return BitFieldExtract(allVertLists[f >> 1], 12 * (f & 1), 12); } @@ -184,7 +184,7 @@ groupshared float gs_HapVertsZ[VERTS_PER_GROUP]; groupshared float gs_HapVertsW[VERTS_PER_GROUP]; groupshared uint gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types) -#ifndef USE_WAVE_INTRINSICS +#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS // 1 array * 16 elements * 4 bytes each = 64 bytes. groupshared uint gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces each (HLSL does not support small data types) @@ -199,7 +199,7 @@ groupshared uint gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z can groupshared uint gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique. groupshared uint gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate groupshared uint gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate -#endif // USE_WAVE_INTRINSICS +#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS // Returns 'true' if it manages to cull the face. bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS]) @@ -322,7 +322,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint for (int j = 0; j < 4; j++) { - uint v = BitFieldExtract(vertMaskOfFace, 3 * j, 3); + uint v = BitFieldExtract(vertListOfFace, 3 * j, 3); // Non-zero if ANY of the vertices are behind any of the planes. clipMaskOfFace |= behindMasksOfVerts[v]; @@ -467,7 +467,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) const float3 rbpY = lgtDat.boxAxisY.xyz; // Pre-scaled const float3 rbpZ = lgtDat.boxAxisZ.xyz; // Pre-scaled -#ifndef USE_WAVE_INTRINSICS +#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS // (0) Initialize the TGSM. 
if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts { @@ -481,7 +481,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) gs_NdcAaBbMinPtW[groupLocalLightIndex] = asuint(FLT_INF); gs_NdcAaBbMaxPtW[groupLocalLightIndex] = asuint(0.0f); } -#endif // USE_WAVE_INTRINSICS +#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF); float4 ndcAaBbMaxPt = 0; @@ -563,8 +563,15 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) gs_BehindMasksOfVerts[firstVertexOffset + v] = behindMask; } -#ifdef USE_WAVE_INTRINSICS - // ... +#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS + for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) + { + uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes + uint orMask = 0; // Plays no role + uint xorMask = 1 << i; // Flip bits one by one starting from the LSB + // TODO: Francesco - expose the right intrinsic. + cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask); + } #else InterlockedOr(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask); @@ -633,6 +640,10 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) } } +#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS + GroupMemoryBarrierWithGroupSync(); +#endif + uint behindMasksOfVerts[NUM_VERTS]; for (uint i = 0; i < NUM_VERTS; i++) @@ -661,8 +672,15 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) } } -#ifdef USE_WAVE_INTRINSICS - // ... +#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS + for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) + { + uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes + uint orMask = 0; // Plays no role + uint xorMask = 1 << i; // Flip bits one by one starting from the LSB + // TODO: Francesco - expose the right intrinsic. 
+ cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask); + } #else InterlockedAnd(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask); @@ -694,11 +712,25 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) } } -#ifdef USE_WAVE_INTRINSICS - // ... +#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS + for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) + { + uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes + uint orMask = 0; // Plays no role + uint xorMask = 1 << i; // Flip bits one by one starting from the LSB + // TODO: Francesco - expose the right intrinsic. + ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, orMask, 0, xorMask)); + ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, orMask, 0, xorMask)); + ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, orMask, 0, xorMask)); + ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, orMask, 0, xorMask)); + ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, orMask, 0, xorMask)); + ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, orMask, 0, xorMask)); + ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, orMask, 0, xorMask)); + ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, orMask, 0, xorMask)); + } #else // Integer comparison works for floating-point numbers as long as the sign bit is 0. - // We must take care of -0 ourselves. saturate() does not help here. + // We must take care of -0 ourselves. saturate() does not help. 
InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x))); InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x))); InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y))); @@ -718,7 +750,7 @@ void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[groupLocalLightIndex]); ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[groupLocalLightIndex]); ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[groupLocalLightIndex]); -#endif // USE_WAVE_INTRINSICS +#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts { From 9e4b8c635c4c5205cac84951aa13c1bce8321edf Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 16:07:59 -0700 Subject: [PATCH 14/22] Fix group count --- .../Runtime/Lighting/LightLoop/LightLoop.cs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs index 3d6fc7dc90c..9ae90873a5a 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs @@ -2786,7 +2786,12 @@ static void GenerateLightsScreenSpaceAABBs(in BuildGPULightListParameters parame cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mProjectionArr, parameters.lightListProjHMatrices); cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mInvProjectionArr, parameters.lightListInvProjHMatrices); - cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, (parameters.totalLightCount + 7) / 8, parameters.viewCount, 1); + const int threadsPerLight = 4; // Shader: THREADS_PER_LIGHT (4) + const int 
threadsPerGroup = 64; // Shader: THREADS_PER_GROUP (64) + + int groupCount = HDUtils.DivRoundUp(parameters.totalLightCount * threadsPerLight, threadsPerGroup); + + cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, groupCount, parameters.viewCount, 1); } } From f52d29ba941cb70226606f189b0867858958a7ec Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 16:14:11 -0700 Subject: [PATCH 15/22] Reduce the kernel count to 1 --- .../Runtime/Lighting/LightLoop/LightLoop.cs | 7 +------ .../Runtime/Lighting/LightLoop/scrbound.compute | 8 +++----- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs index 9ae90873a5a..a476e3b789f 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs @@ -564,8 +564,6 @@ public void Allocate() Shader deferredTilePixelShader { get { return defaultResources.shaders.deferredTilePS; } } - static int s_GenAABBKernel; - static int s_GenAABBKernel_Oblique; static int s_GenListPerTileKernel; static int s_GenListPerTileKernel_Oblique; static int s_GenListPerVoxelKernel; @@ -782,9 +780,6 @@ void InitializeLightLoop(IBLFilterBSDF[] iBLFilterBSDFArray) m_MaxLightsOnScreen = m_MaxDirectionalLightsOnScreen + m_MaxPunctualLightsOnScreen + m_MaxAreaLightsOnScreen + m_MaxEnvLightsOnScreen; m_MaxPlanarReflectionOnScreen = lightLoopSettings.maxPlanarReflectionOnScreen; - s_GenAABBKernel = buildScreenAABBShader.FindKernel("ScreenBoundsAABB"); - s_GenAABBKernel_Oblique = buildScreenAABBShader.FindKernel("ScreenBoundsAABB_Oblique"); - // Cluster { s_ClearVoxelAtomicKernel = buildPerVoxelLightListShader.FindKernel("ClearAtomic"); @@ -3075,7 +3070,7 @@ BuildGPULightListParameters 
PrepareBuildGPULightListParameters(HDCamera hdCamera // Screen space AABB parameters.screenSpaceAABBShader = buildScreenAABBShader; - parameters.screenSpaceAABBKernel = isProjectionOblique ? s_GenAABBKernel_Oblique : s_GenAABBKernel; + parameters.screenSpaceAABBKernel = 0; // camera to screen matrix (and it's inverse) for (int viewIndex = 0; viewIndex < hdCamera.viewCount; ++viewIndex) { diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index efa35aed440..6f697c6cb4e 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -1,10 +1,6 @@ // The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7. // https://github.com/wolfgangfengel/GPU-Pro-7 -#pragma kernel ScreenBoundsAABB SCRAABBGEN=ScreenBoundsAABB -#pragma kernel ScreenBoundsAABB_Oblique SCRAABBGEN=ScreenBoundsAABB_Oblique USE_OBLIQUE_MODE - - #include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl" #include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl" @@ -13,6 +9,8 @@ // #pragma enable_d3d11_debug_symbols #pragma only_renderers d3d11 playstation xboxone vulkan metal switch +#pragma kernel GenLightAABB + uniform int g_isOrthographic; uniform int g_iNrVisibLights; @@ -408,7 +406,7 @@ void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, ou #endif // Z_BINNING [numthreads(NR_THREADS, 1, 1)] -void SCRAABBGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) +void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint groupID = u3GroupID.x; uint eyeIndex = u3GroupID.y; // 
currently, can only be 0 or 1 From 64f50ba7319eb9ad91a084ecb49376ff68acb010 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 16:49:23 -0700 Subject: [PATCH 16/22] Remove old code --- .../Lighting/LightLoop/scrbound.compute | 701 +++--------------- 1 file changed, 96 insertions(+), 605 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 6f697c6cb4e..86144d43973 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -11,23 +11,21 @@ #pragma kernel GenLightAABB +/* ------------------------------ Inputs ------------------------------------ */ + uniform int g_isOrthographic; uniform int g_iNrVisibLights; uniform float4x4 g_mInvProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS]; uniform float4x4 g_mProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS]; -StructuredBuffer g_data : register( t0 ); - -#define NR_THREADS 64 +StructuredBuffer g_data : register(t0); -// output buffer -RWStructuredBuffer g_vBoundsBuffer : register( u0 ); +/* ------------------------------ Outputs ----------------------------------- */ -#define Z_BINNING -#define DUMB_COMPILER +RWStructuredBuffer g_vBoundsBuffer : register(u0); -#ifdef Z_BINNING +/* ------------------------------ Utilities --------------------------------- */ // Returns the location of the N-th set bit starting from the lowest order bit and working upward. // Slow implementation - do not use for large bit sets. 
@@ -101,6 +99,10 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) 0, 0, 1, 0); } +/* ------------------------------ Implementation ---------------------------- */ + +#define DUMB_COMPILER // Improve the quality of generated code + #define CLEAR_SIGN_BIT(X) (asint(X) & INT_MAX) #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks @@ -349,7 +351,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint } void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERTS], - bool isOrthoProj, float4x4 invProj, + bool isOrthoProj, float4x4 invProjMat, inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt) { #ifdef DUMB_COMPILER @@ -367,7 +369,7 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT if (isOrthoProj) // Must replace (w = 1) { - rbpVertVSz = dot(invProj[2], hapVert); + rbpVertVSz = dot(invProjMat[2], hapVert); } ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz)); @@ -379,105 +381,70 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT } } -#else // !Z_BINNING - -#define THREADS_PER_LIGHT (8) -#define THREADS_PER_GROUP (64) -#define LIGHTS_PER_GROUP (THREADS_PER_GROUP / THREADS_PER_LIGHT) -#define MAX_PNTS (9) // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed) - // However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane - // clipping gets skipped which doesn't cause any errors. - - -// LDS (2496 bytes) -groupshared float posX[MAX_PNTS*8*2]; -groupshared float posY[MAX_PNTS*8*2]; -groupshared float posZ[MAX_PNTS*8*2]; -groupshared float posW[MAX_PNTS*8*2]; -groupshared unsigned int clipFlags[48]; - +//********************************************************************************************** +// The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range). 
+// The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices. +// +// Since a light volume may be partially off-screen, we must clip it before computing the AABB. +// Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB. +// +// To avoid having to deal with the "Moebius twist" property of the perspective transform, +// we perform clipping using the homogeneous (projective) post-perspective coordinates. +// This clipping method is described in Blinn's paper titled "Line Clipping". +// +// The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the +// worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4). +// Some faces may require culling rather than clipping (the former is simpler). +// +// It's important to realize that face culling may end up culling 5 (or even all 6) faces. +// This means that the clipped light volume may be reduced to a single polygon, or nothing at all. +// (Imagine a view volume completely or partially inside a light volume). +// Therefore, we must perform view-volume-corner-inside-light-volume tests. +// +// +// Notation: +// rbp - real (3D) coordinates before perspective +// hbp - hom. (4D) coordinates before perspective +// hap - hom.
(4D) coordinates after perspective +// rap - real (3D) coordinates after perspective (after division by w) +// ********************************************************************************************* + +[numthreads(THREADS_PER_GROUP, 1, 1)] +void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) +{ + const uint t = threadID; + const uint g = groupID.x; + const uint eyeIndex = groupID.y; // Currently, can only be 0 or 1 -unsigned int GetClip(const float4 P); -int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p); -void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r); + const uint intraGroupLightIndex = t / THREADS_PER_LIGHT; + const uint globalLightIndex = g * LIGHTS_PER_GROUP + intraGroupLightIndex; + const uint firstVertexOffset = intraGroupLightIndex * NUM_VERTS; -#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightingConvexHullUtils.hlsl" + const int eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex); + const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset]; -#endif // Z_BINNING + const float4x4 projMat = g_mProjectionArr[eyeIndex]; + const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex]; -[numthreads(NR_THREADS, 1, 1)] -void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) -{ - uint groupID = u3GroupID.x; - uint eyeIndex = u3GroupID.y; // currently, can only be 0 or 1 - - // The g_ is preserved in order to make cross-pipeline (FPTL) updates easier - float4x4 g_mInvProjection = g_mInvProjectionArr[eyeIndex]; - float4x4 g_mProjection = g_mProjectionArr[eyeIndex]; - - //uint vindex = groupID * NR_THREADS + threadID; - unsigned int g = groupID; - unsigned int t = threadID; - - const int subLigt = (uint) (t/THREADS_PER_LIGHT); - const int lgtIndex = subLigt+(uint) 
g*LIGHTS_PER_GROUP; - const int sideIndex = (uint) (t%8); - - const int eyeAdjustedLgtIndex = GenerateLightCullDataIndex(lgtIndex, g_iNrVisibLights, eyeIndex); - SFiniteLightBound lgtDat = g_data[eyeAdjustedLgtIndex]; - -#ifdef Z_BINNING - //********************************************************************************************** - // The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range). - // The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices. - // - // Since a light volume may be partially off-screen, we must clip it before computing the AABB. - // Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB. - // - // To avoid having to deal with the "Moebius twist" property of the perspective transform, - // we perform clipping using the homogeneous (projective) post-perspective coordinates. - // This clipping method in described in Blinn's paper titled "Line Clipping". - // - // The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the - // worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4). - // Some faces may require culling rather than clipping (the former is simpler). - // - // It's important to realize that face culling may end up culling 5 (or even all 6) faces. - // This means that the clipped light volume may be reduced to a single polygon, or nothing at all. - // (Imagine a view volume completely or partially inside a light volume). - // Therefore, we must perform view-volume-corner-inside-light-volume tests. - // - // - // Notation: - // rbp - real (3D) coordinates before perspective - // hbp - hom. (4D) coordinates before perspective - // hap - hom. 
(4D) coordinates after perspective - // rap - real (3D) coordinates after perspective (after division by w) - // ********************************************************************************************* - - const uint groupLocalLightIndex = t / THREADS_PER_LIGHT; - const uint firstVertexOffset = NUM_VERTS * groupLocalLightIndex; - - const float scale = lgtDat.scaleXY; // scale.x = scale.y - const float3 rbpC = lgtDat.center.xyz; - // TODO: store X, Y, Scale - const float3 rbpX = lgtDat.boxAxisX.xyz; // Pre-scaled - const float3 rbpY = lgtDat.boxAxisY.xyz; // Pre-scaled - const float3 rbpZ = lgtDat.boxAxisZ.xyz; // Pre-scaled + const float scale = cullData.scaleXY; // scale.x = scale.y + const float3 rbpC = cullData.center.xyz; // View-space + const float3 rbpX = cullData.boxAxisX.xyz; // Pre-scaled + const float3 rbpY = cullData.boxAxisY.xyz; // Pre-scaled + const float3 rbpZ = cullData.boxAxisZ.xyz; // Pre-scaled #ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS // (0) Initialize the TGSM. 
if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts { - gs_CullClipFaceMasks[groupLocalLightIndex] = 0; // Initially inside - gs_NdcAaBbMinPtX[groupLocalLightIndex] = asuint(1.0f); - gs_NdcAaBbMaxPtX[groupLocalLightIndex] = asuint(0.0f); - gs_NdcAaBbMinPtY[groupLocalLightIndex] = asuint(1.0f); - gs_NdcAaBbMaxPtY[groupLocalLightIndex] = asuint(0.0f); - gs_NdcAaBbMinPtZ[groupLocalLightIndex] = asuint(1.0f); - gs_NdcAaBbMaxPtZ[groupLocalLightIndex] = asuint(0.0f); - gs_NdcAaBbMinPtW[groupLocalLightIndex] = asuint(FLT_INF); - gs_NdcAaBbMaxPtW[groupLocalLightIndex] = asuint(0.0f); + gs_CullClipFaceMasks[intraGroupLightIndex] = 0; // Initially inside + gs_NdcAaBbMinPtX[intraGroupLightIndex] = asuint(1.0f); + gs_NdcAaBbMaxPtX[intraGroupLightIndex] = asuint(0.0f); + gs_NdcAaBbMinPtY[intraGroupLightIndex] = asuint(1.0f); + gs_NdcAaBbMaxPtY[intraGroupLightIndex] = asuint(0.0f); + gs_NdcAaBbMinPtZ[intraGroupLightIndex] = asuint(1.0f); + gs_NdcAaBbMaxPtZ[intraGroupLightIndex] = asuint(0.0f); + gs_NdcAaBbMinPtW[intraGroupLightIndex] = asuint(FLT_INF); + gs_NdcAaBbMaxPtW[intraGroupLightIndex] = asuint(0.0f); } #endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS @@ -511,7 +478,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) // Avoid generating (w = 0). rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN; - float4 hapVert = mul(g_mProjection, float4(rbpVertVS, 1)); + float4 hapVert = mul(projMat, float4(rbpVertVS, 1)); // Warning: the W component may be negative. 
// Flipping the -W pyramid by negating all coordinates is incorrect @@ -571,11 +538,11 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask); } #else - InterlockedOr(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask); + InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask); GroupMemoryBarrierWithGroupSync(); - cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex]; + cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex]; #endif // (2) Test the corners of the view volume. @@ -615,8 +582,8 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float3 rapVertCS = GenerateVertexOfStandardCube(v); rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1] - float4 hbpVertVS = mul(g_mInvProjection, float4(rapVertCS, 1)); // Clip to view space - float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS); // View to light space + float4 hbpVertVS = mul(invProjMat, float4(rapVertCS, 1)); // Clip to view space + float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS); // View to light space // Consider the vertex to be inside the light volume if: // -w < x < w @@ -680,11 +647,11 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask); } #else - InterlockedAnd(gs_CullClipFaceMasks[groupLocalLightIndex], cullClipFaceMask); + InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask); GroupMemoryBarrierWithGroupSync(); - cullClipFaceMask = gs_CullClipFaceMasks[groupLocalLightIndex]; + cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex]; #endif // (4) Clip the faces. 
@@ -704,7 +671,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) float4 vertRingBuffer[MAX_CLIP_VERTS]; ClipFaceAgainstViewVolume(f, behindMasksOfVerts, firstVertexOffset, srcBegin, srcSize, vertRingBuffer); - UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, g_mInvProjection, + UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, invProjMat, ndcAaBbMinPt, ndcAaBbMaxPt); } } @@ -729,510 +696,34 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) #else // Integer comparison works for floating-point numbers as long as the sign bit is 0. // We must take care of -0 ourselves. saturate() does not help. - InterlockedMin(gs_NdcAaBbMinPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x))); - InterlockedMax(gs_NdcAaBbMaxPtX[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x))); - InterlockedMin(gs_NdcAaBbMinPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y))); - InterlockedMax(gs_NdcAaBbMaxPtY[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y))); - InterlockedMin(gs_NdcAaBbMinPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z))); - InterlockedMax(gs_NdcAaBbMaxPtZ[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z))); - InterlockedMin(gs_NdcAaBbMinPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w))); - InterlockedMax(gs_NdcAaBbMaxPtW[groupLocalLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w))); + InterlockedMin(gs_NdcAaBbMinPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x))); + InterlockedMax(gs_NdcAaBbMaxPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x))); + InterlockedMin(gs_NdcAaBbMinPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y))); + InterlockedMax(gs_NdcAaBbMaxPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y))); + InterlockedMin(gs_NdcAaBbMinPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z))); + 
InterlockedMax(gs_NdcAaBbMaxPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z))); + InterlockedMin(gs_NdcAaBbMinPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w))); + InterlockedMax(gs_NdcAaBbMaxPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w))); GroupMemoryBarrierWithGroupSync(); - ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[groupLocalLightIndex]); - ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[groupLocalLightIndex]); - ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[groupLocalLightIndex]); - ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[groupLocalLightIndex]); - ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[groupLocalLightIndex]); - ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[groupLocalLightIndex]); - ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[groupLocalLightIndex]); - ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[groupLocalLightIndex]); + ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[intraGroupLightIndex]); + ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[intraGroupLightIndex]); + ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[intraGroupLightIndex]); + ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[intraGroupLightIndex]); + ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[intraGroupLightIndex]); + ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[intraGroupLightIndex]); + ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[intraGroupLightIndex]); + ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]); #endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts { - // Each light's AABB is represented by two float3s, the min and max of the box. - // And for stereo, we have two sets of lights. Therefore, each eye has a set of mins, followed by - // a set of maxs, and each set is equal to g_iNrVisibLights. 
- const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex); - - g_vBoundsBuffer[boundsIndices.min] = ndcAaBbMinPt; - g_vBoundsBuffer[boundsIndices.max] = ndcAaBbMaxPt; - } - -#else // !Z_BINNING - const float3 boxX = lgtDat.boxAxisX.xyz; - const float3 boxY = lgtDat.boxAxisY.xyz; - const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a spot-light) - const float3 center = lgtDat.center.xyz; - const float radius = lgtDat.radius; - const float2 scaleXY = lgtDat.scaleXY; - - { - if(sideIndex<6 && lgtIndex<(int) g_iNrVisibLights) // mask 2 out of 8 threads - { - float3 q0, q1, q2, q3; - GetHullQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, sideIndex); - - - const float4 vP0 = mul(g_mProjection, float4(q0, 1)); - const float4 vP1 = mul(g_mProjection, float4(q1, 1)); - const float4 vP2 = mul(g_mProjection, float4(q2, 1)); - const float4 vP3 = mul(g_mProjection, float4(q3, 1)); - - // test vertices of one quad (of the convex hull) for intersection - const unsigned int uFlag0 = GetClip(vP0); - const unsigned int uFlag1 = GetClip(vP1); - const unsigned int uFlag2 = GetClip(vP2); - const unsigned int uFlag3 = GetClip(vP3); - - const float4 vPnts[] = {vP0, vP1, vP2, vP3}; - - // screen-space AABB of one quad (assuming no intersection) - float3 vMin, vMax; - for(int k=0; k<4; k++) - { - float fW = vPnts[k].w; - float fS = fW<0 ? -1 : 1; - float fWabs = fW<0 ? 
(-fW) : fW; - fW = fS * (fWabs>(i*6))&0x3f; - uFlagAnd &= uClipBits; - uFlagOr |= uClipBits; - } - - uCollectiveAnd &= uFlagAnd; - uCollectiveOr |= uFlagOr; - } - - bool bSetBoundYet = false; - float3 vMin=0.0, vMax=0.0; - if(uCollectiveAnd!=0 || uCollectiveOr==0) // all invisible or all visible (early out) - { - if(uCollectiveOr==0) // all visible - { - for(f=0; f<6; f++) - { - const int sideIndex = f; - - float3 vFaceMi = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 0], posY[subLigt*MAX_PNTS*2 + sideIndex + 0], posZ[subLigt*MAX_PNTS*2 + sideIndex + 0]); - float3 vFaceMa = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 6], posY[subLigt*MAX_PNTS*2 + sideIndex + 6], posZ[subLigt*MAX_PNTS*2 + sideIndex + 6]); - - for(int k=0; k<2; k++) - { - float3 vP = k==0 ? vFaceMi : vFaceMa; - if(f==0 && k==0) { vMin=vP; vMax=vP; } - - vMax = max(vMax, vP); vMin = min(vMin, vP); - } - } - bSetBoundYet=true; - } - } - else // :( need true clipping - { - - for(f=0; f<6; f++) - { - float3 q0, q1, q2, q3; - GetHullQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, f); - - // 4 vertices to a quad of the convex hull in post projection space - const float4 vP0 = mul(g_mProjection, float4(q0, 1)); - const float4 vP1 = mul(g_mProjection, float4(q1, 1)); - const float4 vP2 = mul(g_mProjection, float4(q2, 1)); - const float4 vP3 = mul(g_mProjection, float4(q3, 1)); - - - int iSrcIndex = 0; - - int offs = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2; - - // fill up source clip buffer with the quad - posX[offs+0]=vP0.x; posX[offs+1]=vP1.x; posX[offs+2]=vP2.x; posX[offs+3]=vP3.x; - posY[offs+0]=vP0.y; posY[offs+1]=vP1.y; posY[offs+2]=vP2.y; posY[offs+3]=vP3.y; - posZ[offs+0]=vP0.z; posZ[offs+1]=vP1.z; posZ[offs+2]=vP2.z; posZ[offs+3]=vP3.z; - posW[offs+0]=vP0.w; posW[offs+1]=vP1.w; posW[offs+2]=vP2.w; posW[offs+3]=vP3.w; - - int iNrSrcVerts = 4; - - // do true clipping - for(int p=0; p<6; p++) - { - const int nrVertsDst = ClipAgainstPlane(iSrcIndex, iNrSrcVerts, subLigt, p); - - iSrcIndex = 
1-iSrcIndex; - iNrSrcVerts = nrVertsDst; - - if(iNrSrcVerts<3 || iNrSrcVerts>=MAX_PNTS) break; - } - - // final clipped convex primitive is in src buffer - if(iNrSrcVerts>2) - { - int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2; - for(int k=0; kradius) - { - float2 vMi, vMa; - bool2 bMi, bMa; - CalcBound(bMi, bMa, vMi, vMa, g_mInvProjection, center, radius); - - vMin.xy = bMi ? max(vMin.xy, vMi) : vMin.xy; - vMax.xy = bMa ? min(vMax.xy, vMa) : vMax.xy; - } - else if(g_isOrthographic!=0) - { - float2 vMi = mul(g_mProjection, float4(center.xyz-radius,1)).xy; // no division needed for ortho - float2 vMa = mul(g_mProjection, float4(center.xyz+radius,1)).xy; // no division needed for ortho - vMin.xy = max(vMin.xy, vMi); - vMax.xy = min(vMax.xy, vMa); - } -#ifndef USE_OBLIQUE_MODE -#if USE_LEFT_HAND_CAMERA_SPACE - if((center.z-radius)>0.0) - { - float4 vPosF = mul(g_mProjection, float4(0,0,center.z-radius,1)); - vMin.z = max(vMin.z, vPosF.z/vPosF.w); - } - if((center.z+radius)>0.0) - { - float4 vPosB = mul(g_mProjection, float4(0,0,center.z+radius,1)); - vMax.z = min(vMax.z, vPosB.z/vPosB.w); - } -#else - if((center.z+radius)<0.0) - { - float4 vPosF = mul(g_mProjection, float4(0,0,center.z+radius,1)); - vMin.z = max(vMin.z, vPosF.z/vPosF.w); - } - if((center.z-radius)<0.0) - { - float4 vPosB = mul(g_mProjection, float4(0,0,center.z-radius,1)); - vMax.z = min(vMax.z, vPosB.z/vPosB.w); - } -#endif - else - { - vMin = float3(-3,-3,-3); - vMax = float3(-2,-2,-2); - } -#endif - } - - - // we should consider doing a look-up here into a max depth mip chain - // to see if the light is occluded: vMin.z*VIEWPORT_SCALE_Z > MipTexelMaxDepth - //g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, -0.5*vMax.y+0.5, vMin.z*VIEWPORT_SCALE_Z); - //g_vBoundsBuffer[lgtIndex+g_iNrVisibLights] = float3(0.5*vMax.x+0.5, -0.5*vMin.y+0.5, vMax.z*VIEWPORT_SCALE_Z); - - // changed for unity - - // Each light's AABB is represented by two float3s, the min and max of the box. 
- // And for stereo, we have two sets of lights. Therefore, each eye has a set of mins, followed by - // a set of maxs, and each set is equal to g_iNrVisibLights. - const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(lgtIndex, g_iNrVisibLights, eyeIndex); - - // build a linear (in camera space) min/max Z for the aabb. This is needed for clustered when oblique is active - float linMiZ, linMaZ; -#ifndef USE_OBLIQUE_MODE - float2 vMiZW = mul(g_mInvProjection, float4(vMin,1)).zw; - float2 vMaZW = mul(g_mInvProjection, float4(vMax,1)).zw; - linMiZ = vMiZW.x/vMiZW.y; linMaZ = vMaZW.x/vMaZW.y; -#else - for(int i=0; i<8; i++) // establish 8 aabb points in camera space. - { - float3 vP = float3((i&1)!=0 ? vMax.x : vMin.x, (i&2)!=0 ? vMax.y : vMin.y, (i&4)!=0 ? vMax.z : vMin.z); - - float2 v2Pc = mul(g_mInvProjection, float4(vP,1)).zw; - float linZ = v2Pc.x/v2Pc.y; - - if(i==0) { linMiZ=linZ; linMaZ=linZ; } -#if USE_LEFT_HAND_CAMERA_SPACE - linMiZ = min(linMiZ, linZ); linMaZ = max(linMaZ, linZ); -#else - linMiZ = max(linMiZ, linZ); linMaZ = min(linMaZ, linZ); -#endif - } - - float z0 = center.z-radius, z1 = center.z+radius; -#if USE_LEFT_HAND_CAMERA_SPACE - linMiZ = max(linMiZ, z0); linMaZ = min(linMaZ, z1); -#else - linMiZ = min(linMiZ, z1); linMaZ = max(linMaZ, z0); -#endif - -#endif - - g_vBoundsBuffer[boundsIndices.min] = float4(0.5*vMin.x + 0.5, 0.5*vMin.y + 0.5, vMin.z*VIEWPORT_SCALE_Z, linMiZ); - g_vBoundsBuffer[boundsIndices.max] = float4(0.5*vMax.x + 0.5, 0.5*vMax.y + 0.5, vMax.z*VIEWPORT_SCALE_Z, linMaZ); - } - } -#endif // Z_BINNING -} - -#ifndef Z_BINNING - -float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p); - -int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p) -{ - int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2; - int offs_dst = (1-iSrcIndex)*MAX_PNTS+subLigt*MAX_PNTS*2; - - float4 vPrev = float4(posX[offs_src+(iNrSrcVerts-1)], posY[offs_src+(iNrSrcVerts-1)], 
posZ[offs_src+(iNrSrcVerts-1)], posW[offs_src+(iNrSrcVerts-1)]); - - int nrVertsDst = 0; - - unsigned int uMask = (1<P.w)?2:0) | ((P.y<-P.w)?4:0) | ((P.y>P.w)?8:0) | ((P.z<0)?16:0) | ((P.z>P.w)?32:0)) & (bIsObliqueClipPlane ? 0x1f : 0x3f); -} - -float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p) -{ - const float fS = p==4 ? 0 : ((p&1)==0 ? -1 : 1); - const int index = ((uint) p)/2; - float x1 = index==0 ? vVisib.x : (index==1 ? vVisib.y : vVisib.z); - float x0 = index==0 ? vInvisib.x : (index==1 ? vInvisib.y : vInvisib.z); - - //fS*((vVisib.w-vInvisib.w)*t + vInvisib.w) = (x1-x0)*t + x0; - - const float fT = (fS*vInvisib.w-x0)/((x1-x0) - fS*(vVisib.w-vInvisib.w)); - float4 vNew = vVisib*fT + vInvisib*(1-fT); - - // just to be really anal we make sure the clipped against coordinate is precise - if(index==0) vNew.x = fS*vNew.w; - else if(index==1) vNew.y = fS*vNew.w; - else vNew.z = fS*vNew.w; - - return vNew; -} - - -float4 TransformPlaneToPostSpace(float4x4 InvProjection, float4 plane) -{ - return mul(plane, InvProjection); -} - -float4 EvalPlanePair(out bool validPlanes, float2 posXY_in, float r) -{ - // rotate by 90 degrees to avoid potential division by zero - bool bMustFlip = abs(posXY_in.y)0.0; - - return res; } - -void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r) -{ - bool validX, validY; - float4 planeX = EvalPlanePair(validX, float2(pos_view_space.x, pos_view_space.z), r); - float4 planeY = EvalPlanePair(validY, float2(pos_view_space.y, pos_view_space.z), r); - - -#if USE_LEFT_HAND_CAMERA_SPACE - planeX = planeX.zwxy; // need to swap left/right and top/bottom planes when using left hand system - planeY = planeY.zwxy; -#endif - - bIsMinValid = bool2(planeX.z<0, planeY.z<0) && bool2(validX,validY); - bIsMaxValid = bool2((-planeX.x)<0, (-planeY.x)<0) && bool2(validX,validY); - - // hopefully the compiler takes zeros into account - // 
should be the case since the transformation in TransformPlaneToPostSpace() - // is done using multiply-adds and not dot product instructions. - float4 planeX0 = TransformPlaneToPostSpace(InvProjection, float4(planeX.x, 0, planeX.y, 0)); - float4 planeX1 = TransformPlaneToPostSpace(InvProjection, float4(planeX.z, 0, planeX.w, 0)); - float4 planeY0 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.x, planeY.y, 0)); - float4 planeY1 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.z, planeY.w, 0)); - - - // convert planes to the forms (1,0,0,D) and (0,1,0,D) - // 2D bound is given by -D components - float2 A = -float2(planeX0.w / planeX0.x, planeY0.w / planeY0.y); - float2 B = -float2(planeX1.w / planeX1.x, planeY1.w / planeY1.y); - - // Bound is complete - vMin = B; - vMax = A; -} - -#endif // !Z_BINNING \ No newline at end of file From 1a3e1725aea8e2e913c6509ff3804593e4b414fd Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 17:25:46 -0700 Subject: [PATCH 17/22] Bounds check --- .../Lighting/LightLoop/LightCullUtils.hlsl | 16 ++++---- .../Lighting/LightLoop/scrbound.compute | 40 +++++++++---------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl index ea8d937ca7c..4a2a69df125 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl @@ -3,31 +3,33 @@ // Used to index into our SFiniteLightBound (g_data) and // LightVolumeData (_LightVolumeData) buffers. 
-int GenerateLightCullDataIndex(int lightIndex, uint numVisibleLights, uint eyeIndex) +uint GenerateLightCullDataIndex(uint lightIndex, uint numVisibleLights, uint eyeIndex) { + lightIndex = min(lightIndex, numVisibleLights - 1); // Stay within bounds + // For monoscopic, there is just one set of light cull data structs. // In stereo, all of the left eye structs are first, followed by the right eye structs. - const int perEyeBaseIndex = (int)eyeIndex * (int)numVisibleLights; + const uint perEyeBaseIndex = eyeIndex * numVisibleLights; return (perEyeBaseIndex + lightIndex); } struct ScreenSpaceBoundsIndices { - int min; - int max; + uint min; + uint max; }; // The returned values are used to index into our AABB screen space bounding box buffer // Usually named g_vBoundsBuffer. The two values represent the min/max indices. -ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(int lightIndex, uint numVisibleLights, uint eyeIndex) +ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(uint lightIndex, uint numVisibleLights, uint eyeIndex) { // In the monoscopic mode, there is one set of bounds (min,max -> 2 * g_iNrVisibLights) // In stereo, there are two sets of bounds (leftMin, leftMax, rightMin, rightMax -> 4 * g_iNrVisibLights) - const int eyeRelativeBase = (int)eyeIndex * 2 * (int)numVisibleLights; + const uint eyeRelativeBase = eyeIndex * 2 * numVisibleLights; ScreenSpaceBoundsIndices indices; indices.min = eyeRelativeBase + lightIndex; - indices.max = eyeRelativeBase + lightIndex + (int)numVisibleLights; + indices.max = indices.min + numVisibleLights; return indices; } diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 86144d43973..c804adad4dc 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ 
b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -13,8 +13,8 @@ /* ------------------------------ Inputs ------------------------------------ */ -uniform int g_isOrthographic; -uniform int g_iNrVisibLights; +uniform uint g_isOrthographic; +uniform uint g_iNrVisibLights; uniform float4x4 g_mInvProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS]; uniform float4x4 g_mProjectionArr[SHADEROPTIONS_XR_MAX_VIEWS]; @@ -207,7 +207,7 @@ bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS]) uint cullMaskOfFace = FACE_MASK; // Initially behind uint vertListOfFace = GetVertexListOfFace(f); - for (int j = 0; j < 4; j++) + for (uint j = 0; j < 4; j++) { uint v = BitFieldExtract(vertListOfFace, 3 * j, 3); // Non-zero if ALL the vertices are behind any of the planes. @@ -310,7 +310,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize, } } -void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint firstVertexOffset, +void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint baseVertexOffset, out uint srcBegin, out uint srcSize, out float4 vertRingBuffer[MAX_CLIP_VERTS]) { @@ -320,7 +320,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint uint clipMaskOfFace = 0; // Initially in front uint vertListOfFace = GetVertexListOfFace(f); - for (int j = 0; j < 4; j++) + for (uint j = 0; j < 4; j++) { uint v = BitFieldExtract(vertListOfFace, 3 * j, 3); // Non-zero if ANY of the vertices are behind any of the planes. @@ -328,10 +328,10 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint // Not all edges may require clipping. However, filtering the vertex list // is somewhat expensive, so we currently don't do it. 
- vertRingBuffer[j].x = gs_HapVertsX[firstVertexOffset + v]; - vertRingBuffer[j].y = gs_HapVertsY[firstVertexOffset + v]; - vertRingBuffer[j].z = gs_HapVertsZ[firstVertexOffset + v]; - vertRingBuffer[j].w = gs_HapVertsW[firstVertexOffset + v]; + vertRingBuffer[j].x = gs_HapVertsX[baseVertexOffset + v]; + vertRingBuffer[j].y = gs_HapVertsY[baseVertexOffset + v]; + vertRingBuffer[j].z = gs_HapVertsZ[baseVertexOffset + v]; + vertRingBuffer[j].w = gs_HapVertsW[baseVertexOffset + v]; } // Sutherland-Hodgeman polygon clipping algorithm. @@ -418,10 +418,10 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) const uint intraGroupLightIndex = t / THREADS_PER_LIGHT; const uint globalLightIndex = g * LIGHTS_PER_GROUP + intraGroupLightIndex; - const uint firstVertexOffset = intraGroupLightIndex * NUM_VERTS; + const uint baseVertexOffset = intraGroupLightIndex * NUM_VERTS; - const int eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex); - const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset]; + const uint eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex); + const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset]; const float4x4 projMat = g_mProjectionArr[eyeIndex]; const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex]; @@ -521,11 +521,11 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) cullClipFaceMask |= GetFaceMaskOfVertex(v); } - gs_HapVertsX[firstVertexOffset + v] = hapVert.x; - gs_HapVertsY[firstVertexOffset + v] = hapVert.y; - gs_HapVertsZ[firstVertexOffset + v] = hapVert.z; - gs_HapVertsW[firstVertexOffset + v] = hapVert.w; - gs_BehindMasksOfVerts[firstVertexOffset + v] = behindMask; + gs_HapVertsX[baseVertexOffset + v] = hapVert.x; + gs_HapVertsY[baseVertexOffset + v] = hapVert.y; + gs_HapVertsZ[baseVertexOffset + v] = hapVert.z; + gs_HapVertsW[baseVertexOffset + v] = hapVert.w; + 
gs_BehindMasksOfVerts[baseVertexOffset + v] = behindMask; } #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS @@ -613,7 +613,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) for (uint i = 0; i < NUM_VERTS; i++) { - behindMasksOfVerts[i] = gs_BehindMasksOfVerts[firstVertexOffset + i]; + behindMasksOfVerts[i] = gs_BehindMasksOfVerts[baseVertexOffset + i]; } // (3) Cull the faces. @@ -669,7 +669,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint srcBegin, srcSize; float4 vertRingBuffer[MAX_CLIP_VERTS]; - ClipFaceAgainstViewVolume(f, behindMasksOfVerts, firstVertexOffset, + ClipFaceAgainstViewVolume(f, behindMasksOfVerts, baseVertexOffset, srcBegin, srcSize, vertRingBuffer); UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, invProjMat, ndcAaBbMinPt, ndcAaBbMaxPt); @@ -717,7 +717,7 @@ void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]); #endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS - if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts + if ((globalLightIndex < g_iNrVisibLights) && (t % THREADS_PER_LIGHT == 0)) // Avoid bank conflicts { // For stereo, we have two sets of lights. Therefore, each eye has a set of mins // followed by a set of maxs, and each set is equal to g_iNrVisibLights. 
From 3e28378351118888600c4a4b5b3495c39d71302e Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 17:39:14 -0700 Subject: [PATCH 18/22] Add a profiling marker --- .../Runtime/Lighting/LightLoop/LightLoop.cs | 27 ++++++++++--------- .../Lighting/LightLoop/scrbound.compute | 4 +-- .../Runtime/RenderPipeline/HDProfileId.cs | 1 + 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs index a476e3b789f..8af2d142711 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs @@ -2769,24 +2769,27 @@ static void GenerateLightsScreenSpaceAABBs(in BuildGPULightListParameters parame { if (parameters.totalLightCount != 0) { - var tileAndCluster = resources.tileAndClusterData; + using (new ProfilingScope(cmd, ProfilingSampler.Get(HDProfileId.GenerateLightAABBs))) + { + var tileAndCluster = resources.tileAndClusterData; - cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_isOrthographic, parameters.isOrthographic ? 1 : 0); + cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_isOrthographic, parameters.isOrthographic ? 
1 : 0); - // With XR single-pass, we have one set of light bounds per view to iterate over (bounds are in view space for each view) - cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_iNrVisibLights, parameters.totalLightCount); - cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_data, tileAndCluster.convexBoundsBuffer); - cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_vBoundsBuffer, tileAndCluster.AABBBoundsBuffer); + // With XR single-pass, we have one set of light bounds per view to iterate over (bounds are in view space for each view) + cmd.SetComputeIntParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_iNrVisibLights, parameters.totalLightCount); + cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_data, tileAndCluster.convexBoundsBuffer); + cmd.SetComputeBufferParam(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, HDShaderIDs.g_vBoundsBuffer, tileAndCluster.AABBBoundsBuffer); - cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mProjectionArr, parameters.lightListProjHMatrices); - cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mInvProjectionArr, parameters.lightListInvProjHMatrices); + cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mProjectionArr, parameters.lightListProjHMatrices); + cmd.SetComputeMatrixArrayParam(parameters.screenSpaceAABBShader, HDShaderIDs.g_mInvProjectionArr, parameters.lightListInvProjHMatrices); - const int threadsPerLight = 4; // Shader: THREADS_PER_LIGHT (4) - const int threadsPerGroup = 64; // Shader: THREADS_PER_GROUP (64) + const int threadsPerLight = 4; // Shader: THREADS_PER_LIGHT (4) + const int threadsPerGroup = 64; // Shader: THREADS_PER_GROUP (64) - int groupCount = HDUtils.DivRoundUp(parameters.totalLightCount * 
threadsPerLight, threadsPerGroup); + int groupCount = HDUtils.DivRoundUp(parameters.totalLightCount * threadsPerLight, threadsPerGroup); - cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, groupCount, parameters.viewCount, 1); + cmd.DispatchCompute(parameters.screenSpaceAABBShader, parameters.screenSpaceAABBKernel, groupCount, parameters.viewCount, 1); + } } } diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index c804adad4dc..233876c830c 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -9,7 +9,7 @@ // #pragma enable_d3d11_debug_symbols #pragma only_renderers d3d11 playstation xboxone vulkan metal switch -#pragma kernel GenLightAABB +#pragma kernel main /* ------------------------------ Inputs ------------------------------------ */ @@ -410,7 +410,7 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT // ********************************************************************************************* [numthreads(THREADS_PER_GROUP, 1, 1)] -void GenLightAABB(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) +void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) { const uint t = threadID; const uint g = groupID.x; diff --git a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs index 8959bcbc944..1d5e71b62f7 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs +++ b/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/HDProfileId.cs @@ -15,6 +15,7 @@ internal enum HDProfileId DenoiseSSAO, UpSampleSSAO, ScreenSpaceShadows, + 
GenerateLightAABBs, BuildLightList, ContactShadows, BlitToFinalRTDevBuildOnly, From 06a8d7081cce9889c573d8816d28f1d457436e51 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 17:58:14 -0700 Subject: [PATCH 19/22] Fix lane masks --- .../Lighting/LightLoop/scrbound.compute | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 233876c830c..585441e30c0 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -535,7 +535,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint orMask = 0; // Plays no role uint xorMask = 1 << i; // Flip bits one by one starting from the LSB // TODO: Francesco - expose the right intrinsic. - cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask); + cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask); } #else InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask); @@ -644,7 +644,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint orMask = 0; // Plays no role uint xorMask = 1 << i; // Flip bits one by one starting from the LSB // TODO: Francesco - expose the right intrinsic. - cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, orMask, 0, xorMask); + cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask); } #else InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask); @@ -684,14 +684,14 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint orMask = 0; // Plays no role uint xorMask = 1 << i; // Flip bits one by one starting from the LSB // TODO: Francesco - expose the right intrinsic. 
- ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, orMask, 0, xorMask)); - ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, orMask, 0, xorMask)); - ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, orMask, 0, xorMask)); - ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, orMask, 0, xorMask)); - ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, orMask, 0, xorMask)); - ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, orMask, 0, xorMask)); - ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, orMask, 0, xorMask)); - ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, orMask, 0, xorMask)); + ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask)); + ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask)); + ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask)); + ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, andMask, orMask, xorMask)); + ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, andMask, orMask, xorMask)); + ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, andMask, orMask, xorMask)); + ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, andMask, orMask, xorMask)); + ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, andMask, orMask, xorMask)); } #else // Integer comparison works for floating-point numbers as long as the sign bit is 0. 
From a7fcd99e66edfdf1539186ccfd20fcc845731a35 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 18:44:10 -0700 Subject: [PATCH 20/22] Fix compiler warning --- .../Lighting/LightLoop/scrbound.compute | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 585441e30c0..7cdeab66b02 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -101,7 +101,7 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) /* ------------------------------ Implementation ---------------------------- */ -#define DUMB_COMPILER // Improve the quality of generated code +#define DUMB_COMPILER // Improve the quality of generated code at the expense of readability #define CLEAR_SIGN_BIT(X) (asint(X) & INT_MAX) #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks @@ -259,10 +259,10 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize, uint modDstIdx = dstBegin % MAX_CLIP_VERTS; #endif - for (uint k = srcBegin; k < (srcBegin + srcSize); k++) + for (uint j = srcBegin; j < (srcBegin + srcSize); j++) { #ifndef DUMB_COMPILER - uint modSrcIdx = k % MAX_CLIP_VERTS; + uint modSrcIdx = j % MAX_CLIP_VERTS; #endif ClipVertex leadVert = CreateClipVertex(p, vertRingBuffer[modSrcIdx]); @@ -457,8 +457,10 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) // any single plane, we can trivially reject (cull) that face. uint cullClipFaceMask = 0; // Initially inside + uint i; // Avoid multiply-declared variable warning + // (1) Compute the vertices of the light volume. 
- for (uint i = 0; i < VERTS_PER_THREAD; i++) + for (i = 0; i < VERTS_PER_THREAD; i++) { uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; @@ -529,7 +531,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) } #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS - for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) + for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) { uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes uint orMask = 0; // Plays no role @@ -575,7 +577,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix); } - for (uint i = 0; i < VERTS_PER_THREAD; i++) + for (i = 0; i < VERTS_PER_THREAD; i++) { uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; @@ -611,7 +613,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint behindMasksOfVerts[NUM_VERTS]; - for (uint i = 0; i < NUM_VERTS; i++) + for (i = 0; i < NUM_VERTS; i++) { behindMasksOfVerts[i] = gs_BehindMasksOfVerts[baseVertexOffset + i]; } @@ -621,7 +623,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) const uint cullFaceMask = cullClipFaceMask; const uint numFacesToCull = countbits(cullFaceMask); // [0, 6] - for (uint i = 0; i < FACES_PER_THREAD; i++) + for (i = 0; i < FACES_PER_THREAD; i++) { uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; @@ -638,7 +640,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) } #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS - for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) + for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) { uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes uint orMask = 0; // Plays no role @@ -659,7 +661,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) const uint clipFaceMask = cullClipFaceMask; const uint numFacesToClip = countbits(clipFaceMask); // [0, 6] - for (uint i = 0; i < FACES_PER_THREAD; i++) + for (i = 0; i < 
FACES_PER_THREAD; i++) { uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT; @@ -678,7 +680,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) } #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS - for (uint i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) + for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++) { uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes uint orMask = 0; // Plays no role From 71004f0b983dbb1c16d1842b8011147ab469b2a9 Mon Sep 17 00:00:00 2001 From: Evgenii Date: Tue, 11 Aug 2020 18:56:38 -0700 Subject: [PATCH 21/22] Remove GPU Pro reference --- .../Runtime/Lighting/LightLoop/scrbound.compute | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 7cdeab66b02..26783afb163 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -1,16 +1,13 @@ -// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7. 
-// https://github.com/wolfgangfengel/GPU-Pro-7 +// #pragma enable_d3d11_debug_symbols +#pragma only_renderers d3d11 playstation xboxone vulkan metal switch + +#pragma kernel main #include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl" #include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl" #include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl" -// #pragma enable_d3d11_debug_symbols -#pragma only_renderers d3d11 playstation xboxone vulkan metal switch - -#pragma kernel main - /* ------------------------------ Inputs ------------------------------------ */ uniform uint g_isOrthographic; From 21c2481e2ee71bbd88878a37fd24e197f61837bd Mon Sep 17 00:00:00 2001 From: Evgenii Date: Wed, 26 Aug 2020 12:00:28 -0700 Subject: [PATCH 22/22] No intrinsics on Xbox --- .../Lighting/LightLoop/scrbound.compute | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute index 26783afb163..95670ade423 100644 --- a/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute +++ b/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/scrbound.compute @@ -98,7 +98,19 @@ float4x4 PerspectiveProjection4x4(float a, float g, float n, float f) /* ------------------------------ Implementation ---------------------------- */ -#define DUMB_COMPILER // Improve the quality of generated code at the expense of readability +// Improve the quality of generated code at the expense of readability. +// Remove when the shader compiler is clever enough to perform this optimization for us. 
+#define DUMB_COMPILER + +#ifdef SHADER_API_XBOXONE +// The Xbox shader compiler expects the lane swizzle mask to be a compile-time constant. +// In our case, the mask is a compile-time constant, but it is defined inside a loop +// that is unrolled at compile time, and the constants are generated during the +// constant propagation pass of the optimizer. This works fine on PlayStation, but does not work +// on Xbox. In order to avoid writing hideous code specifically for Xbox, we disable support +// for wave intrinsics on Xbox until the Xbox compiler is fixed. +#undef PLATFORM_SUPPORTS_WAVE_INTRINSICS +#endif #define CLEAR_SIGN_BIT(X) (asint(X) & INT_MAX) #define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks @@ -533,7 +545,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes uint orMask = 0; // Plays no role uint xorMask = 1 << i; // Flip bits one by one starting from the LSB - // TODO: Francesco - expose the right intrinsic. + cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask); } #else @@ -642,7 +654,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes uint orMask = 0; // Plays no role uint xorMask = 1 << i; // Flip bits one by one starting from the LSB - // TODO: Francesco - expose the right intrinsic. + cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask); } #else @@ -682,7 +694,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID) uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes uint orMask = 0; // Plays no role uint xorMask = 1 << i; // Flip bits one by one starting from the LSB - // TODO: Francesco - expose the right intrinsic. 
+ ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask)); ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask)); ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask));