Skip to content

Commit

Permalink
Revert "Make sure light loop is scalarized by defining correctly SCAL…
Browse files Browse the repository at this point in the history
…ARIZE_LIGHT_LOOP (#1684)"

This reverts commit 2d016c5.
  • Loading branch information
sebastienlagarde committed Sep 14, 2020
1 parent 5bd028c commit f5ab594
Show file tree
Hide file tree
Showing 10 changed files with 78 additions and 91 deletions.
1 change: 0 additions & 1 deletion com.unity.render-pipelines.high-definition/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Fixed warning with area mesh (case 1268379)
- Fixed issue with diffusion profile not being updated upon reset of the editor.
- Fixed an issue that led to corrupted refraction in some scenarios on xbox.
- Fixed for light loop scalarization not happening.

### Changed
- Preparation pass for RTSSShadows to be supported by render graph.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,15 @@ void ShadowLoopMin(HDShadowContext shadowContext, PositionInputs posInput, float
#endif

bool fastPath = false;
#if SCALARIZE_LIGHT_LOOP
uint lightStartLane0;
fastPath = IsFastPath(lightStart, lightStartLane0);

if (fastPath)
{
lightStart = lightStartLane0;
}
#endif

// Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), but only the ones relevant to the current thread/pixel are processed.
// For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,9 @@
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/ScreenSpaceLighting/ScreenSpaceGlobalIllumination.cs.hlsl"
#endif

#ifndef SCALARIZE_LIGHT_LOOP
// We perform scalarization only for forward rendering: for deferred, loads will already be scalar since tiles match waves and therefore all threads read from the same tile.
// More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
#endif


//-----------------------------------------------------------------------------
// LightLoop
Expand Down Expand Up @@ -255,11 +252,7 @@ void LightLoop( float3 V, PositionInputs posInput, PreLightData preLightData, BS
while (v_lightListOffset < lightCount)
{
v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
#if SCALARIZE_LIGHT_LOOP
uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
#else
uint s_lightIdx = v_lightIdx;
#endif
if (s_lightIdx == -1)
break;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,43 @@ uint FetchIndex(uint lightStart, uint lightOffset)

#endif // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER

// Reports whether every active lane of the wave starts from the same tile/cluster list offset.
// lightStart:      this lane's start offset.
// lightStartLane0: receives the first active lane's start offset when SCALARIZE_LIGHT_LOOP is
//                  enabled, otherwise a copy of lightStart.
// Returns true only when scalarization is enabled and all active lanes agree with that offset.
bool IsFastPath(uint lightStart, out uint lightStartLane0)
{
    // Defaults used when scalarization is compiled out.
    bool waveSharesStart = false;
    lightStartLane0 = lightStart;

#if SCALARIZE_LIGHT_LOOP
    // Fast path: all pixels in the wave are accessing the same tile or cluster.
    lightStartLane0 = WaveReadLaneFirst(lightStart);
    waveSharesStart = WaveActiveAllTrue(lightStart == lightStartLane0);
#endif

    return waveSharesStart;
}

// This function scalarizes an index across all lanes. To be efficient it must be used in the context
// of the scalarization of a loop. It is meant to be used with IsFastPath so it can optimize the number of
// elements to load, which is optimal when all the lanes are contained in a single tile.
//
// v_elementIdx: this lane's element index (may differ from lane to lane).
// fastPath:     value returned by IsFastPath; when true the wave-wide min is skipped
//               (presumably because the index is then already identical on every lane - confirm against callers).
// Returns the index to process, or -1 (0xffffffff once converted to uint) when the wave-wide min yields no valid index.
// When SCALARIZE_LIGHT_LOOP evaluates to false, v_elementIdx is returned unchanged.
uint ScalarizeElementIndex(uint v_elementIdx, bool fastPath)
{
uint s_elementIdx = v_elementIdx;
#if SCALARIZE_LIGHT_LOOP
if (!fastPath)
{
// If we are not in fast path, v_elementIdx is not scalar, so we need to query the Min value across the wave.
s_elementIdx = WaveActiveMin(v_elementIdx);
// If WaveActiveMin returns 0xffffffff it means that all lanes are actually dead, so we can safely ignore the loop and move forward.
// This could happen as a helper lane could reach this point, hence having a valid v_elementIdx, but their values will be ignored by the WaveActiveMin
if (s_elementIdx == -1)
{
return -1;
}
}
// Note that the WaveReadLaneFirst should not be needed, but the compiler might insist on putting the result in VGPR.
// However, we are certain at this point that the index is scalar.
s_elementIdx = WaveReadLaneFirst(s_elementIdx);
#endif
return s_elementIdx;
}

uint FetchIndexWithBoundsCheck(uint start, uint count, uint i)
{
if (i < count)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
// This file should only be included inside of ProbeVolume.hlsl.
// There are no #ifndef HEADER guards to stop multiple inclusion, as this is simply used for code gen.

#ifndef SCALARIZE_LIGHT_LOOP
// We perform scalarization only for forward rendering: for deferred, loads will already be scalar since tiles match waves and therefore all threads read from the same tile.
// More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
#endif


#ifndef PROBE_VOLUMES_ACCUMULATE_MODE
#error "PROBE_VOLUMES_ACCUMULATE_MODE must be defined as 0, 1, or 2 before including ProbeVolumeAccumulate.hlsl. 0 triggers generation of SH0 variant, 1 triggers generation of SH1 variant, and 2 triggers generation of SH2 variant.";
#endif
Expand Down Expand Up @@ -47,35 +40,17 @@
if (weightHierarchy >= 1.0) { return; }
#endif

bool fastPath = false;


uint probeVolumeStart, probeVolumeCount;
// Fetch first probe volume to provide the scene proxy for screen space computation
ProbeVolumeGetCountAndStart(posInput, probeVolumeStart, probeVolumeCount);

#if SCALARIZE_LIGHT_LOOP
uint probeStartLane0;
fastPath = IsFastPath(probeVolumeStart, probeStartLane0);

if (fastPath)
{
probeVolumeStart = probeStartLane0;
}
#endif
bool fastPath;
ProbeVolumeGetCountAndStartAndFastPath(posInput, probeVolumeStart, probeVolumeCount, fastPath);

// Scalarized loop, same rationale of the punctual light version
uint v_probeVolumeListOffset = 0;
uint v_probeVolumeIdx = probeVolumeStart;
while (v_probeVolumeListOffset < probeVolumeCount)
{
v_probeVolumeIdx = ProbeVolumeFetchIndex(probeVolumeStart, v_probeVolumeListOffset);
#if SCALARIZE_LIGHT_LOOP
uint s_probeVolumeIdx = ScalarizeElementIndex(v_probeVolumeIdx, fastPath);
#else
uint s_probeVolumeIdx = v_probeVolumeIdx;
#endif

uint s_probeVolumeIdx = ProbeVolumeScalarizeElementIndex(v_probeVolumeIdx, fastPath);
if (s_probeVolumeIdx == -1) { break; }

// Scalar load.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/ProbeVolume/ProbeVolumeShaderVariables.hlsl"

#ifndef SCALARIZE_LIGHT_LOOP
// We perform scalarization only for forward rendering: for deferred, loads will already be scalar since tiles match waves and therefore all threads read from the same tile.
// More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
#endif

#if SHADEROPTIONS_PROBE_VOLUMES_EVALUATION_MODE == PROBEVOLUMESEVALUATIONMODES_MATERIAL_PASS
// Cluster helper functions copied and lightly modified from ClusteredUtils.hlsl with ENABLE_DEPTH_TEXTURE_BACKPLANE undefined

Expand Down Expand Up @@ -117,6 +111,8 @@ void ProbeVolumeGetCountAndStart(PositionInputs posInput, out uint probeVolumeSt

void ProbeVolumeGetCountAndStartAndFastPath(PositionInputs posInput, out uint probeVolumeStart, out uint probeVolumeCount, out bool fastPath)
{
// Fetch first probe volume to provide the scene proxy for screen space computation
ProbeVolumeGetCountAndStart(posInput, probeVolumeStart, probeVolumeCount);
fastPath = false;

#if SCALARIZE_LIGHT_LOOP
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoopDef.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/Shadow/ContactShadows.hlsl"

// We perform scalarization all the time here as we don't know if we have clustered data structure or not at this point.
// More info on scalarization: https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/
#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER))

#pragma only_renderers d3d11 playstation xboxone vulkan metal switch

// #pragma enable_d3d11_debug_symbols
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/Decal.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Material/Decal/DecalPrepassBuffer.hlsl"

#ifndef SCALARIZE_LIGHT_LOOP
#define SCALARIZE_LIGHT_LOOP (defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS) && !defined(LIGHTLOOP_DISABLE_TILE_AND_CLUSTER) && SHADERPASS == SHADERPASS_FORWARD)
#endif

DECLARE_DBUFFER_TEXTURE(_DBufferTexture);

// In order that the lod for transparent decal better match the lod for opaque decal
Expand Down Expand Up @@ -188,9 +192,11 @@ DecalSurfaceData GetDecalSurfaceData(PositionInputs posInput, inout float alpha)
#ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
GetCountAndStart(posInput, LIGHTCATEGORY_DECAL, decalStart, decalCount);

#if SCALARIZE_LIGHT_LOOP
// Fast path is when all pixels in a wave are accessing the same tile or cluster.
uint decalStartLane0;
bool fastPath = IsFastPath(decalStart, decalStartLane0);
uint decalStartLane0 = WaveReadLaneFirst(decalStart);
bool fastPath = WaveActiveAllTrue(decalStart == decalStartLane0);
#endif

#else // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
decalCount = _DecalCount;
Expand Down Expand Up @@ -220,9 +226,26 @@ DecalSurfaceData GetDecalSurfaceData(PositionInputs posInput, inout float alpha)
v_decalIdx = decalStart + v_decalListOffset;
#endif // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER

uint s_decalIdx = ScalarizeElementIndex(v_decalIdx, fastPath);
if (s_decalIdx == -1)
break;
uint s_decalIdx = v_decalIdx;

#if SCALARIZE_LIGHT_LOOP

if (!fastPath)
{
// If we are not in fast path, v_decalIdx is not scalar, so we need to query the Min value across the wave.
s_decalIdx = WaveActiveMin(v_decalIdx);
// If WaveActiveMin returns 0xffffffff it means that all lanes are actually dead, so we can safely ignore the loop and move forward.
// This could happen as a helper lane could reach this point, hence having a valid v_decalIdx, but their values will be ignored by the WaveActiveMin
if (s_decalIdx == -1)
{
break;
}
}
// Note that the WaveReadLaneFirst should not be needed, but the compiler might insist in putting the result in VGPR.
// However, we are certain at this point that the index is scalar.
s_decalIdx = WaveReadLaneFirst(s_decalIdx);

#endif // SCALARIZE_LIGHT_LOOP

DecalData s_decalData = FetchDecal(s_decalIdx);
bool isRejected = (s_decalData.decalLayerMask & decalLayerMask) == 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
#define RAY_TRACING_OPTIONAL_ALPHA_TEST_PASS
#endif



// ----------------------------------------------------------------------------

CBUFFER_START(UnityPerDraw)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,48 +163,4 @@ float3 TransformPreviousObjectToWorld(float3 positionOS)
return mul(previousModelMatrix, float4(positionOS, 1.0)).xyz;
}


// ----------------------------------------------------------------------------
// Scalarization helper functions.
// These assume a scalarization of a list of elements as described in https://flashypixels.wordpress.com/2018/11/10/intro-to-gpu-scalarization-part-2-scalarize-all-the-lights/

// Reports whether every active lane of the wave starts from the same tile/cluster list offset.
// lightStart:      this lane's start offset.
// lightStartLane0: receives the first active lane's start offset when PLATFORM_SUPPORTS_WAVE_INTRINSICS
//                  is defined, otherwise a copy of lightStart.
// Returns true only when wave intrinsics are available and all active lanes agree with that offset.
bool IsFastPath(uint lightStart, out uint lightStartLane0)
{
    // Defaults used when wave intrinsics are not available.
    bool waveSharesStart = false;
    lightStartLane0 = lightStart;

#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    // Fast path: all pixels in the wave are accessing the same tile or cluster.
    lightStartLane0 = WaveReadLaneFirst(lightStart);
    waveSharesStart = WaveActiveAllTrue(lightStart == lightStartLane0);
#endif

    return waveSharesStart;
}

// This function scalarizes an index across all lanes. To be efficient it must be used in the context
// of the scalarization of a loop. It is meant to be used with IsFastPath so it can optimize the number of
// elements to load, which is optimal when all the lanes are contained in a single tile.
// Please note that if PLATFORM_SUPPORTS_WAVE_INTRINSICS is not defined, this will *not* scalarize the index.
//
// v_elementIdx: this lane's element index (may differ from lane to lane).
// fastPath:     value returned by IsFastPath; when true the wave-wide min is skipped
//               (presumably because the index is then already identical on every lane - confirm against callers).
// Returns the index to process, or -1 (0xffffffff once converted to uint) when the wave-wide min yields no valid index.
uint ScalarizeElementIndex(uint v_elementIdx, bool fastPath)
{
uint s_elementIdx = v_elementIdx;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
if (!fastPath)
{
// If we are not in fast path, v_elementIdx is not scalar, so we need to query the Min value across the wave.
s_elementIdx = WaveActiveMin(v_elementIdx);
// If WaveActiveMin returns 0xffffffff it means that all lanes are actually dead, so we can safely ignore the loop and move forward.
// This could happen as a helper lane could reach this point, hence having a valid v_elementIdx, but their values will be ignored by the WaveActiveMin
if (s_elementIdx == -1)
{
return -1;
}
}
// Note that the WaveReadLaneFirst should not be needed, but the compiler might insist on putting the result in VGPR.
// However, we are certain at this point that the index is scalar.
s_elementIdx = WaveReadLaneFirst(s_elementIdx);
#endif
return s_elementIdx;
}


#endif // UNITY_SHADER_VARIABLES_FUNCTIONS_INCLUDED

0 comments on commit f5ab594

Please sign in to comment.