Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions com.unity.render-pipelines.high-definition/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Fixed
- Fixed GC allocations from XR occlusion mesh when using multipass.
- Fixed XR depth copy when using MSAA.
- Fixed register spilling on FXC in light list shaders.

## [11.0.0] - 2020-10-21

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,38 @@ groupshared float4 lightPlanes[4*6]; // Each plane is defined by a float4. 6 pla

groupshared uint lightOffs;

// Scratch arrays placed in groupshared memory. Both are laid out as one slice of
// LIGHTCATEGORY_COUNT consecutive ints per thread: the entry for (threadIdx, category)
// lives at index 'threadIdx * LIGHTCATEGORY_COUNT + category'.
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
groupshared int shiftIndexScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

// Resets every per-category counter and every per-category index shift that belongs
// to the slice of the scratch arrays owned by 'threadIdx'.
void ZeroCategoryListCountAndShiftIndex(uint threadIdx)
{
    for (int category = 0; category < LIGHTCATEGORY_COUNT; ++category)
    {
        // Same slot in both arrays: the thread's slice start plus the category.
        const uint slot = threadIdx * LIGHTCATEGORY_COUNT + category;

        categoryListCountScratch[slot] = 0;
        shiftIndexScratch[slot]        = 0;
    }
}

// Stores 'value' as the index shift for category 'index' in the slice owned by 'threadIdx'.
void WriteShiftIndex(uint threadIdx, uint index, int value)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    shiftIndexScratch[slot] = value;
}

// Returns the index shift stored for category 'index' in the slice owned by 'threadIdx'.
int ReadShiftIndex(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    return shiftIndexScratch[slot];
}

// Adds one to the counter for category 'index' in the slice owned by 'threadIdx'.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}

// Returns the counter for category 'index' in the slice owned by 'threadIdx'.
int ReadCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    return categoryListCountScratch[slot];
}

#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
groupshared uint ldsZMax;
#endif
Expand Down Expand Up @@ -375,14 +407,11 @@ void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)

// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
// to make it work correctly
int shiftIndex[LIGHTCATEGORY_COUNT];
ZERO_INITIALIZE_ARRAY(int, shiftIndex, LIGHTCATEGORY_COUNT);
shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;
shiftIndex[LIGHTCATEGORY_DENSITY_VOLUME] = _DensityVolumeIndexShift;
ZeroCategoryListCountAndShiftIndex(t);

int categoryListCount[LIGHTCATEGORY_COUNT]; // number of direct lights, reflection probes, decals, density volumes, and probe volumes
ZERO_INITIALIZE_ARRAY(int, categoryListCount, LIGHTCATEGORY_COUNT);
WriteShiftIndex(t, LIGHTCATEGORY_ENV, _EnvLightIndexShift);
WriteShiftIndex(t, LIGHTCATEGORY_DECAL, _DecalIndexShift);
WriteShiftIndex(t, LIGHTCATEGORY_DENSITY_VOLUME, _DensityVolumeIndexShift);

uint offs = start;
for(int ll=0; ll<iNrCoarseLights; ll+=4)
Expand All @@ -401,8 +430,8 @@ void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
++categoryListCount[lightCategory];
g_vLayeredLightList[offs++] = coarseList[l] - shiftIndex[lightCategory];
IncrementCategoryListCount(t, lightCategory);
g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
}
}

Expand All @@ -416,12 +445,12 @@ void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);
for(int category=0; category<LIGHTCATEGORY_COUNT; category++)
{
int numLights = min(categoryListCount[category],31); // only allow 5 bits
int numLights = min(ReadCategoryListCount(t, category),31); // only allow 5 bits
if(i<nrClusters)
{
g_LayeredOffset[offs] = (start+localOffs) | (((uint) numLights)<<27);
offs += (nrClusters*nrTilesX*nrTilesY);
localOffs += categoryListCount[category]; // use unclamped count for localOffs
localOffs += ReadCategoryListCount(t, category); // use unclamped count for localOffs
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,36 @@ groupshared uint gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
groupshared uint gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS


// ----------- Keep the vertex ring buffer in LDS (groupshared memory); otherwise FXC spills registers.
// The float4 vertices are stored as four separate component arrays (X, Y, Z, W).
// Each thread uses MAX_CLIP_VERTS consecutive entries: the element for (threadIdx, entry)
// lives at index 'threadIdx * MAX_CLIP_VERTS + entry' in every component array.

groupshared float gs_VertexRingBufferX[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferY[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferZ[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferW[MAX_CLIP_VERTS * THREADS_PER_GROUP];

// Reassembles the float4 stored at position 'entry' of the ring buffer slice owned by
// 'threadIdx' by gathering its four components from the per-component arrays.
float4 GetFromRingBuffer(uint threadIdx, uint entry)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    return float4(gs_VertexRingBufferX[slot],
                  gs_VertexRingBufferY[slot],
                  gs_VertexRingBufferZ[slot],
                  gs_VertexRingBufferW[slot]);
}

// Scatters the four components of 'value' into position 'entry' of the ring buffer
// slice owned by 'threadIdx'.
void WriteToRingBuffer(uint threadIdx, uint entry, float4 value)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    gs_VertexRingBufferX[slot] = value.x;
    gs_VertexRingBufferY[slot] = value.y;
    gs_VertexRingBufferZ[slot] = value.z;
    gs_VertexRingBufferW[slot] = value.w;
}
/////////////////////////////////////////////////////////


// Returns 'true' if it manages to cull the face.
bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
bool TryCullFace(uint f, uint baseOffsetVertex)
{
uint cullMaskOfFace = FACE_MASK; // Initially behind
uint vertListOfFace = GetVertexListOfFace(f);
Expand All @@ -225,7 +253,7 @@ bool TryCullFace(uint f, uint behindMasksOfVerts[NUM_VERTS])
{
uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
// Non-zero if ALL the vertices are behind any of the planes.
cullMaskOfFace &= behindMasksOfVerts[v];
cullMaskOfFace &= gs_BehindMasksOfVerts[baseOffsetVertex + v];
}

return (cullMaskOfFace != 0);
Expand Down Expand Up @@ -260,13 +288,13 @@ float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
}

void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
inout float4 vertRingBuffer[MAX_CLIP_VERTS],
uint threadIdx,
out uint dstBegin, out uint dstSize)
{
dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
dstSize = 0;

ClipVertex tailVert = CreateClipVertex(p, vertRingBuffer[(srcBegin + srcSize - 1) % MAX_CLIP_VERTS]);
ClipVertex tailVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, (srcBegin + srcSize - 1) % MAX_CLIP_VERTS));

#ifdef OBTUSE_COMPILER
uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
Expand All @@ -278,7 +306,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
#ifndef OBTUSE_COMPILER
uint modSrcIdx = j % MAX_CLIP_VERTS;
#endif
ClipVertex leadVert = CreateClipVertex(p, vertRingBuffer[modSrcIdx]);
ClipVertex leadVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, modSrcIdx));

// Execute Blinn's line clipping algorithm.
// Classify the line segment. 4 cases:
Expand All @@ -295,7 +323,7 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
#ifndef OBTUSE_COMPILER
uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
#endif
vertRingBuffer[modDstIdx] = clipVert;
WriteToRingBuffer(threadIdx, modDstIdx, clipVert);
#ifdef OBTUSE_COMPILER
dstSize++;
modDstIdx++;
Expand All @@ -308,7 +336,9 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
#ifndef OBTUSE_COMPILER
uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
#endif
vertRingBuffer[modDstIdx] = leadVert.pt;
WriteToRingBuffer(threadIdx, modDstIdx, leadVert.pt);

//vertRingBuffer[modDstIdx] = leadVert.pt;
#ifdef OBTUSE_COMPILER
dstSize++;
modDstIdx++;
Expand All @@ -324,9 +354,9 @@ void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
}
}

void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint baseVertexOffset,
void ClipFaceAgainstViewVolume(uint f, uint baseVertexOffset,
out uint srcBegin, out uint srcSize,
out float4 vertRingBuffer[MAX_CLIP_VERTS])
uint threadIdx)
{
srcBegin = 0;
srcSize = 4;
Expand All @@ -338,14 +368,15 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
{
uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);
// Non-zero if ANY of the vertices are behind any of the planes.
clipMaskOfFace |= behindMasksOfVerts[v];
clipMaskOfFace |= gs_BehindMasksOfVerts[baseVertexOffset + v];

// Not all edges may require clipping. However, filtering the vertex list
// is somewhat expensive, so we currently don't do it.
vertRingBuffer[j].x = gs_HapVertsX[baseVertexOffset + v];
vertRingBuffer[j].y = gs_HapVertsY[baseVertexOffset + v];
vertRingBuffer[j].z = gs_HapVertsZ[baseVertexOffset + v];
vertRingBuffer[j].w = gs_HapVertsW[baseVertexOffset + v];
WriteToRingBuffer(threadIdx, j, float4(gs_HapVertsX[baseVertexOffset + v], gs_HapVertsY[baseVertexOffset + v], gs_HapVertsZ[baseVertexOffset + v], gs_HapVertsW[baseVertexOffset + v]));
//vertRingBuffer[j].x = gs_HapVertsX[baseVertexOffset + v];
//vertRingBuffer[j].y = gs_HapVertsY[baseVertexOffset + v];
//vertRingBuffer[j].z = gs_HapVertsZ[baseVertexOffset + v];
//vertRingBuffer[j].w = gs_HapVertsW[baseVertexOffset + v];
}

// Sutherland-Hodgeman polygon clipping algorithm.
Expand All @@ -355,7 +386,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
uint p = firstbitlow(clipMaskOfFace);

uint dstBegin, dstSize;
ClipPolygonAgainstPlane(p, srcBegin, srcSize, vertRingBuffer, dstBegin, dstSize);
ClipPolygonAgainstPlane(p, srcBegin, srcSize, threadIdx, dstBegin, dstSize);

srcBegin = dstBegin;
srcSize = dstSize;
Expand All @@ -364,7 +395,7 @@ void ClipFaceAgainstViewVolume(uint f, uint behindMasksOfVerts[NUM_VERTS], uint
}
}

void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERTS],
void UpdateAaBb(uint srcBegin, uint srcSize, uint threadIdx,
bool isOrthoProj, float4x4 invProjMat,
inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
{
Expand All @@ -376,7 +407,7 @@ void UpdateAaBb(uint srcBegin, uint srcSize, float4 vertRingBuffer[MAX_CLIP_VERT
#ifndef OBTUSE_COMPILER
uint modSrcIdx = j % MAX_CLIP_VERTS;
#endif
float4 hapVert = vertRingBuffer[modSrcIdx];
float4 hapVert = GetFromRingBuffer(threadIdx, modSrcIdx);
// Clamp to the bounds in case of numerical errors (may still generate -0).
float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
float rbpVertVSz = hapVert.w;
Expand Down Expand Up @@ -714,13 +745,6 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
GroupMemoryBarrierWithGroupSync();
#endif

uint behindMasksOfVerts[NUM_VERTS];

for (i = 0; i < NUM_VERTS; i++)
{
behindMasksOfVerts[i] = gs_BehindMasksOfVerts[baseVertexOffset + i];
}

// (3) Cull the faces.
{
const uint cullFaceMask = cullClipFaceMask;
Expand All @@ -734,7 +758,7 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
{
uint f = NthBitLow(cullFaceMask, n);

if (TryCullFace(f, behindMasksOfVerts))
if (TryCullFace(f, baseVertexOffset))
{
cullClipFaceMask ^= 1 << f; // Clear the bit
}
Expand Down Expand Up @@ -773,10 +797,9 @@ void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
uint f = NthBitLow(clipFaceMask, n);

uint srcBegin, srcSize;
float4 vertRingBuffer[MAX_CLIP_VERTS];
ClipFaceAgainstViewVolume(f, behindMasksOfVerts, baseVertexOffset,
srcBegin, srcSize, vertRingBuffer);
UpdateAaBb(srcBegin, srcSize, vertRingBuffer, g_isOrthographic != 0, invProjMat,
ClipFaceAgainstViewVolume(f, baseVertexOffset,
srcBegin, srcSize, t);
UpdateAaBb(srcBegin, srcSize, t, g_isOrthographic != 0, invProjMat,
ndcAaBbMinPt, ndcAaBbMaxPt);
}
}
Expand Down