From bab85d4372ee7cc784acc7d743ffd2c6886ea460 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 22:17:12 +0100 Subject: [PATCH 1/8] Add SSE version of CombinedShannonEntropy --- .../Formats/Webp/Lossless/LosslessUtils.cs | 154 ++++++++++++++++-- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 471c083cda..52453c77fb 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -759,28 +759,147 @@ public static void BundleColorMap(Span row, int width, int xBits, SpanShanon entropy. public static float CombinedShannonEntropy(Span x, Span y) { - double retVal = 0.0d; - uint sumX = 0, sumXY = 0; - for (int i = 0; i < 256; i++) - { - uint xi = (uint)x[i]; - if (xi != 0) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse41.IsSupported) + { + double retVal = 0.0d; + Span tmp = stackalloc int[4]; + ref int xRef = ref MemoryMarshal.GetReference(x); + ref int yRef = ref MemoryMarshal.GetReference(y); + Vector128 sumXY128 = Vector128.Zero; + Vector128 sumX128 = Vector128.Zero; + ref int tmpRef = ref MemoryMarshal.GetReference(tmp); + for (int i = 0; i < 256; i += 4) { - uint xy = xi + (uint)y[i]; - sumX += xi; - retVal -= FastSLog2(xi); - sumXY += xy; - retVal -= FastSLog2(xy); + Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + + // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. + if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) + { + Vector128 xy128 = Sse2.Add(xVec, yVec); + sumXY128 = Sse2.Add(sumXY128, xy128); + sumX128 = Sse2.Add(sumX128, xVec); + + // Analyze the different X + Y. 
+ Unsafe.As>(ref tmpRef) = xy128; + if (tmp[0] != 0) + { + retVal -= FastSLog2((uint)tmp[0]); + if (x[i + 0] != 0) + { + retVal -= FastSLog2((uint)x[i + 0]); + } + } + + if (tmp[1] != 0) + { + retVal -= FastSLog2((uint)tmp[1]); + if (x[i + 1] != 0) + { + retVal -= FastSLog2((uint)x[i + 1]); + } + } + + if (tmp[2] != 0) + { + retVal -= FastSLog2((uint)tmp[2]); + if (x[i + 2] != 0) + { + retVal -= FastSLog2((uint)x[i + 2]); + } + } + + if (tmp[3] != 0) + { + retVal -= FastSLog2((uint)tmp[3]); + if (x[i + 3] != 0) + { + retVal -= FastSLog2((uint)x[i + 3]); + } + } + } + else + { + // X is fully 0, so only deal with Y. + sumXY128 = Sse2.Add(sumXY128, yVec); + + if (y[i] != 0) + { + retVal -= FastSLog2((uint)y[i]); + } + + if (y[i + 1] != 0) + { + retVal -= FastSLog2((uint)y[i + 1]); + } + + if (y[i + 2] != 0) + { + retVal -= FastSLog2((uint)y[i + 2]); + } + + if (y[i + 3] != 0) + { + retVal -= FastSLog2((uint)y[i + 3]); + } + } } - else if (y[i] != 0) + + // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. + // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. 
+ Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); + Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); + Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); + Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); + Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); + Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); + int sumX = Sse2.ConvertToInt32(tmpSumX); + int sumXY = Sse2.ConvertToInt32(tmpSumXY); + + retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); + + return (float)retVal; + } + else +#endif + { + double retVal = 0.0d; + uint sumX = 0, sumXY = 0; + for (int i = 0; i < 256; i++) { - sumXY += (uint)y[i]; - retVal -= FastSLog2((uint)y[i]); + uint xi = (uint)x[i]; + if (xi != 0) + { + uint xy = xi + (uint)y[i]; + sumX += xi; + retVal -= FastSLog2(xi); + sumXY += xy; + retVal -= FastSLog2(xy); + } + else if (y[i] != 0) + { + sumXY += (uint)y[i]; + retVal -= FastSLog2((uint)y[i]); + } } + + retVal += FastSLog2(sumX) + FastSLog2(sumXY); + return (float)retVal; } + } - retVal += FastSLog2(sumX) + FastSLog2(sumXY); - return (float)retVal; + [MethodImpl(InliningOptions.ShortMethod)] + private static void AnalyzeXy(Span tmp, Span x, int i, int pos, ref double retVal) + { + if (tmp[pos] != 0) + { + retVal -= FastSLog2((uint)tmp[pos]); + if (x[i + pos] != 0) + { + retVal -= FastSLog2((uint)x[i + pos]); + } + } } [MethodImpl(InliningOptions.ShortMethod)] @@ -836,6 +955,7 @@ public static void ColorCodeToMultipliers(uint colorCode, ref Vp8LMultipliers m) private static float FastSLog2Slow(uint v) { DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + if (v < ApproxLogWithCorrectionMax) { int logCnt = 0; @@ -865,7 +985,7 @@ private static float FastSLog2Slow(uint v) private static float FastLog2Slow(uint v) { - Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); if (v < ApproxLogWithCorrectionMax) { From 
cc430cc84626edf63c187f97fe37f6d4ad2ca0da Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 12:56:45 +0100 Subject: [PATCH 2/8] Avoid bounds checks --- .../Formats/Webp/Lossless/LosslessUtils.cs | 61 ++++++++----------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 52453c77fb..0f24e8e8f3 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -783,39 +783,39 @@ public static float CombinedShannonEntropy(Span x, Span y) // Analyze the different X + Y. Unsafe.As>(ref tmpRef) = xy128; - if (tmp[0] != 0) + if (tmpRef != 0) { - retVal -= FastSLog2((uint)tmp[0]); - if (x[i + 0] != 0) + retVal -= FastSLog2((uint)tmpRef); + if (Unsafe.Add(ref xRef, i) != 0) { - retVal -= FastSLog2((uint)x[i + 0]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i)); } } - if (tmp[1] != 0) + if (Unsafe.Add(ref tmpRef, 1) != 0) { - retVal -= FastSLog2((uint)tmp[1]); - if (x[i + 1] != 0) + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1)); + if (Unsafe.Add(ref xRef, i + 1) != 0) { - retVal -= FastSLog2((uint)x[i + 1]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1)); } } - if (tmp[2] != 0) + if (Unsafe.Add(ref tmpRef, 2) != 0) { - retVal -= FastSLog2((uint)tmp[2]); - if (x[i + 2] != 0) + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2)); + if (Unsafe.Add(ref xRef, i + 2) != 0) { - retVal -= FastSLog2((uint)x[i + 2]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2)); } } - if (tmp[3] != 0) + if (Unsafe.Add(ref tmpRef, 3) != 0) { - retVal -= FastSLog2((uint)tmp[3]); - if (x[i + 3] != 0) + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3)); + if (Unsafe.Add(ref xRef, i + 3) != 0) { - retVal -= FastSLog2((uint)x[i + 3]); + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); } } } @@ -824,24 +824,24 @@ public static float CombinedShannonEntropy(Span 
x, Span y) // X is fully 0, so only deal with Y. sumXY128 = Sse2.Add(sumXY128, yVec); - if (y[i] != 0) + if (Unsafe.Add(ref yRef, i) != 0) { - retVal -= FastSLog2((uint)y[i]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i)); } - if (y[i + 1] != 0) + if (Unsafe.Add(ref yRef, i + 1) != 0) { - retVal -= FastSLog2((uint)y[i + 1]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1)); } - if (y[i + 2] != 0) + if (Unsafe.Add(ref yRef, i + 2) != 0) { - retVal -= FastSLog2((uint)y[i + 2]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2)); } - if (y[i + 3] != 0) + if (Unsafe.Add(ref yRef, i + 3) != 0) { - retVal -= FastSLog2((uint)y[i + 3]); + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); } } } @@ -889,19 +889,6 @@ public static float CombinedShannonEntropy(Span x, Span y) } } - [MethodImpl(InliningOptions.ShortMethod)] - private static void AnalyzeXy(Span tmp, Span x, int i, int pos, ref double retVal) - { - if (tmp[pos] != 0) - { - retVal -= FastSLog2((uint)tmp[pos]); - if (x[i + pos] != 0) - { - retVal -= FastSLog2((uint)x[i + pos]); - } - } - } - [MethodImpl(InliningOptions.ShortMethod)] public static byte TransformColorRed(sbyte greenToRed, uint argb) { From ed8bd615f2be3cafd1a23782e9f7d07c6375d967 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 15:38:46 +0100 Subject: [PATCH 3/8] Faster SSE2 version of ShanonEntropy --- .../Formats/Webp/Lossless/LosslessUtils.cs | 115 +++++------------- 1 file changed, 29 insertions(+), 86 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 0f24e8e8f3..68004275bd 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -6,6 +6,7 @@ using System.Runtime.InteropServices; using SixLabors.ImageSharp.Memory; #if SUPPORTS_RUNTIME_INTRINSICS +using System.Numerics; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; 
#endif @@ -706,7 +707,7 @@ public static void ExpandColorMap(int numColors, Span transformData, Span< int colorMapLength4 = 4 * newColorMap.Length; for (; i < colorMapLength4; i++) { - newData[i] = 0; // black tail. + newData[i] = 0; // black tail. } } @@ -760,103 +761,45 @@ public static void BundleColorMap(Span row, int width, int xBits, Span x, Span y) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse41.IsSupported) + if (Sse2.IsSupported) { double retVal = 0.0d; - Span tmp = stackalloc int[4]; ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); - Vector128 sumXY128 = Vector128.Zero; - Vector128 sumX128 = Vector128.Zero; - ref int tmpRef = ref MemoryMarshal.GetReference(tmp); - for (int i = 0; i < 256; i += 4) - { - Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); - Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); - - // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. - if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) - { - Vector128 xy128 = Sse2.Add(xVec, yVec); - sumXY128 = Sse2.Add(sumXY128, xy128); - sumX128 = Sse2.Add(sumX128, xVec); - - // Analyze the different X + Y. 
- Unsafe.As>(ref tmpRef) = xy128; - if (tmpRef != 0) - { - retVal -= FastSLog2((uint)tmpRef); - if (Unsafe.Add(ref xRef, i) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i)); - } - } - if (Unsafe.Add(ref tmpRef, 1) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1)); - if (Unsafe.Add(ref xRef, i + 1) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1)); - } - } - - if (Unsafe.Add(ref tmpRef, 2) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2)); - if (Unsafe.Add(ref xRef, i + 2) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2)); - } - } - - if (Unsafe.Add(ref tmpRef, 3) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3)); - if (Unsafe.Add(ref xRef, i + 3) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); - } - } - } - else + int sumXY = 0; + int sumX = 0; + for (int i = 0; i < 256; i += 16) + { + Vector128 x0 = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector128 y0 = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + Vector128 x1 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 4)); + Vector128 y1 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 4)); + Vector128 x2 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 8)); + Vector128 y2 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 8)); + Vector128 x3 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 12)); + Vector128 y3 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 12)); + Vector128 x4 = Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(x0, x1), Sse2.PackSignedSaturate(x2, x3)); + Vector128 y4 = Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(y0, y1), Sse2.PackSignedSaturate(y2, y3)); + int mx = Sse2.MoveMask(Sse2.CompareGreaterThan(x4, Vector128.Zero).AsByte()); + int my = Sse2.MoveMask(Sse2.CompareGreaterThan(y4, Vector128.Zero).AsByte()) | mx; + while (my != 0) { - // X is fully 0, so only deal with Y. 
- sumXY128 = Sse2.Add(sumXY128, yVec); - - if (Unsafe.Add(ref yRef, i) != 0) + int j = BitOperations.TrailingZeroCount(my); + if (((mx >> j) & 1) != 0) { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i)); + int xij = Unsafe.Add(ref xRef, i + j); + sumXY += xij; + retVal -= FastSLog2((uint)xij); } - if (Unsafe.Add(ref yRef, i + 1) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1)); - } - - if (Unsafe.Add(ref yRef, i + 2) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2)); - } - - if (Unsafe.Add(ref yRef, i + 3) != 0) - { - retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); - } + int xy = Unsafe.Add(ref xRef, i + j) + Unsafe.Add(ref yRef, i + j); + sumX += xy; + retVal -= FastSLog2((uint)xy); + my &= my - 1; } } - // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. - // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. - Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); - Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); - Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); - Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); - Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); - Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); - int sumX = Sse2.ConvertToInt32(tmpSumX); - int sumXY = Sse2.ConvertToInt32(tmpSumXY); - retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); return (float)retVal; From b1df6a97487f1d8ae68da60eaf3953fe6727f523 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 20:34:22 +0100 Subject: [PATCH 4/8] Revert "Faster SSE2 version of ShanonEntropy" Profiling does not prove that this version is actually faster. 
--- .../Formats/Webp/Lossless/LosslessUtils.cs | 115 +++++++++++++----- 1 file changed, 86 insertions(+), 29 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 68004275bd..0f24e8e8f3 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -6,7 +6,6 @@ using System.Runtime.InteropServices; using SixLabors.ImageSharp.Memory; #if SUPPORTS_RUNTIME_INTRINSICS -using System.Numerics; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -707,7 +706,7 @@ public static void ExpandColorMap(int numColors, Span transformData, Span< int colorMapLength4 = 4 * newColorMap.Length; for (; i < colorMapLength4; i++) { - newData[i] = 0; // black tail. + newData[i] = 0; // black tail. } } @@ -761,45 +760,103 @@ public static void BundleColorMap(Span row, int width, int xBits, Span x, Span y) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Sse41.IsSupported) { double retVal = 0.0d; + Span tmp = stackalloc int[4]; ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); - - int sumXY = 0; - int sumX = 0; - for (int i = 0; i < 256; i += 16) + Vector128 sumXY128 = Vector128.Zero; + Vector128 sumX128 = Vector128.Zero; + ref int tmpRef = ref MemoryMarshal.GetReference(tmp); + for (int i = 0; i < 256; i += 4) { - Vector128 x0 = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); - Vector128 y0 = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); - Vector128 x1 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 4)); - Vector128 y1 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 4)); - Vector128 x2 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 8)); - Vector128 y2 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 8)); - Vector128 x3 = Unsafe.As>(ref Unsafe.Add(ref xRef, i + 12)); - Vector128 y3 = Unsafe.As>(ref Unsafe.Add(ref yRef, i + 12)); - Vector128 x4 = 
Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(x0, x1), Sse2.PackSignedSaturate(x2, x3)); - Vector128 y4 = Sse2.PackSignedSaturate(Sse2.PackSignedSaturate(y0, y1), Sse2.PackSignedSaturate(y2, y3)); - int mx = Sse2.MoveMask(Sse2.CompareGreaterThan(x4, Vector128.Zero).AsByte()); - int my = Sse2.MoveMask(Sse2.CompareGreaterThan(y4, Vector128.Zero).AsByte()) | mx; - while (my != 0) + Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + + // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. + if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) { - int j = BitOperations.TrailingZeroCount(my); - if (((mx >> j) & 1) != 0) + Vector128 xy128 = Sse2.Add(xVec, yVec); + sumXY128 = Sse2.Add(sumXY128, xy128); + sumX128 = Sse2.Add(sumX128, xVec); + + // Analyze the different X + Y. + Unsafe.As>(ref tmpRef) = xy128; + if (tmpRef != 0) + { + retVal -= FastSLog2((uint)tmpRef); + if (Unsafe.Add(ref xRef, i) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i)); + } + } + + if (Unsafe.Add(ref tmpRef, 1) != 0) { - int xij = Unsafe.Add(ref xRef, i + j); - sumXY += xij; - retVal -= FastSLog2((uint)xij); + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1)); + if (Unsafe.Add(ref xRef, i + 1) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1)); + } } - int xy = Unsafe.Add(ref xRef, i + j) + Unsafe.Add(ref yRef, i + j); - sumX += xy; - retVal -= FastSLog2((uint)xy); - my &= my - 1; + if (Unsafe.Add(ref tmpRef, 2) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2)); + if (Unsafe.Add(ref xRef, i + 2) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2)); + } + } + + if (Unsafe.Add(ref tmpRef, 3) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3)); + if (Unsafe.Add(ref xRef, i + 3) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); + } + } + } + else + { + // X is fully 0, so only deal 
with Y. + sumXY128 = Sse2.Add(sumXY128, yVec); + + if (Unsafe.Add(ref yRef, i) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i)); + } + + if (Unsafe.Add(ref yRef, i + 1) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1)); + } + + if (Unsafe.Add(ref yRef, i + 2) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2)); + } + + if (Unsafe.Add(ref yRef, i + 3) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); + } } } + // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. + // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. + Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); + Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); + Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); + Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); + Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); + Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); + int sumX = Sse2.ConvertToInt32(tmpSumX); + int sumXY = Sse2.ConvertToInt32(tmpSumXY); + retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); return (float)retVal; From 32b97f41fc564d32110d6939f398618a7d683fc6 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 22 Nov 2021 22:21:46 +0100 Subject: [PATCH 5/8] Add AVX2 version of CombinedShannonEntropy --- src/ImageSharp/Common/Helpers/Numerics.cs | 15 +++ .../Formats/Webp/Lossless/LosslessUtils.cs | 97 ++++++++++++++----- 2 files changed, 89 insertions(+), 23 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ba5c588ca5..9dc13079d6 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -820,6 +820,21 @@ public static int ReduceSum(Vector128 accumulator) } } + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int ReduceSum(Vector256 accumulator) + { + Vector128 vec0 = Avx2.ExtractVector128(accumulator, 0); + Vector128 vec1 = Avx2.ExtractVector128(accumulator, 1); + Vector128 sum128 = Sse2.Add(vec0, vec1); + + return ReduceSum(sum128); + } + /// /// Reduces even elements of the vector into one sum. /// diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 0f24e8e8f3..314f26d64d 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using SixLabors.ImageSharp.Memory; @@ -760,29 +761,30 @@ public static void BundleColorMap(Span row, int width, int xBits, Span x, Span y) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse41.IsSupported) + if (Avx2.IsSupported) { double retVal = 0.0d; - Span tmp = stackalloc int[4]; + Span tmp = stackalloc int[8]; ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); - Vector128 sumXY128 = Vector128.Zero; - Vector128 sumX128 = Vector128.Zero; + Vector256 sumXY256 = Vector256.Zero; + Vector256 sumX256 = Vector256.Zero; ref int tmpRef = ref MemoryMarshal.GetReference(tmp); - for (int i = 0; i < 256; i += 4) + for (nint i = 0; i < 256; i += 8) { - Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); - Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + Vector256 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector256 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. 
- if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) + int mask = Avx2.MoveMask(Avx2.CompareEqual(xVec, Vector256.Zero).AsByte()); + if (mask != -1) { - Vector128 xy128 = Sse2.Add(xVec, yVec); - sumXY128 = Sse2.Add(sumXY128, xy128); - sumX128 = Sse2.Add(sumX128, xVec); + Vector256 xy256 = Avx2.Add(xVec, yVec); + sumXY256 = Avx2.Add(sumXY256, xy256); + sumX256 = Avx2.Add(sumX256, xVec); // Analyze the different X + Y. - Unsafe.As>(ref tmpRef) = xy128; + Unsafe.As>(ref tmpRef) = xy256; if (tmpRef != 0) { retVal -= FastSLog2((uint)tmpRef); @@ -818,11 +820,47 @@ public static float CombinedShannonEntropy(Span x, Span y) retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3)); } } + + if (Unsafe.Add(ref tmpRef, 4) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 4)); + if (Unsafe.Add(ref xRef, i + 4) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 4)); + } + } + + if (Unsafe.Add(ref tmpRef, 5) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 5)); + if (Unsafe.Add(ref xRef, i + 5) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 5)); + } + } + + if (Unsafe.Add(ref tmpRef, 6) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 6)); + if (Unsafe.Add(ref xRef, i + 6) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 6)); + } + } + + if (Unsafe.Add(ref tmpRef, 7) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 7)); + if (Unsafe.Add(ref xRef, i + 7) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 7)); + } + } } else { // X is fully 0, so only deal with Y. 
- sumXY128 = Sse2.Add(sumXY128, yVec); + sumXY256 = Avx2.Add(sumXY256, yVec); if (Unsafe.Add(ref yRef, i) != 0) { @@ -843,19 +881,32 @@ public static float CombinedShannonEntropy(Span x, Span y) { retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3)); } + + if (Unsafe.Add(ref yRef, i + 4) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 4)); + } + + if (Unsafe.Add(ref yRef, i + 5) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 5)); + } + + if (Unsafe.Add(ref yRef, i + 6) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 6)); + } + + if (Unsafe.Add(ref yRef, i + 7) != 0) + { + retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 7)); + } } } - // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. - // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. - Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); - Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); - Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); - Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); - Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); - Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); - int sumX = Sse2.ConvertToInt32(tmpSumX); - int sumXY = Sse2.ConvertToInt32(tmpSumXY); + // Sum up sumX256 to get sumX and sum up sumXY256 to get sumXY. 
+ int sumX = Numerics.ReduceSum(sumX256); + int sumXY = Numerics.ReduceSum(sumXY256); retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); From 0fc3ce721270c50999b715fbff6d9663a8386cc4 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 11:12:46 +0100 Subject: [PATCH 6/8] Add CombinedShannonEntropy tests --- .../Formats/WebP/LosslessUtilsTests.cs | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index 97567ba218..9c7a2f7588 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -10,6 +10,17 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class LosslessUtilsTests { + private static void RunCombinedShannonEntropyTest() + { + int[] x = { 3, 5, 2, 5, 3, 1, 2, 2, 3, 3, 1, 2, 1, 2, 1, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 0, 0, 2, 1, 1, 0, 3, 1, 2, 3, 2, 3 }; + int[] y = { 11, 12, 8, 3, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 2, 1, 1, 2, 4, 6, 4 }; + float expected = 884.7585f; + + float actual = LosslessUtils.CombinedShannonEntropy(x, y); + + Assert.Equal(expected, actual, 5); + } + private static void RunSubtractGreenTest() { uint[] pixelData = @@ -193,6 +204,9 @@ private static void RunPredictor13Test() } } + [Fact] + public void CombinedShannonEntropy_Works() => RunCombinedShannonEntropyTest(); + [Fact] public void Predictor11_Works() => RunPredictor11Test(); @@ -215,6 +229,12 @@ private static void RunPredictor13Test() public void TransformColorInverse_Works() => RunTransformColorInverseTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void CombinedShannonEntropy_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCombinedShannonEntropyTest, HwIntrinsics.AllowAll); + + [Fact] + public void CombinedShannonEntropy_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCombinedShannonEntropyTest, HwIntrinsics.DisableAVX2); + [Fact] public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll); @@ -237,19 +257,19 @@ private static void RunPredictor13Test() public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll); [Fact] - public void SubtractGreen_WithoutAvx_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX); + public void 
SubtractGreen_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX2); [Fact] - public void SubtractGreen_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSSE3); + public void SubtractGreen_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSSE3); [Fact] public void AddGreenToBlueAndRed_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.AllowAll); [Fact] - public void AddGreenToBlueAndRed_WithoutAvx_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX); + public void AddGreenToBlueAndRed_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX2); [Fact] - public void AddGreenToBlueAndRed_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3); + public void AddGreenToBlueAndRed_WithoutAVX2OrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3); [Fact] public void TransformColor_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorTest, HwIntrinsics.AllowAll); From 110ff3d9e8f9651d73286576e17f69f1e2ecfa31 Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Wed, 24 Nov 2021 12:38:44 +0100 Subject: [PATCH 7/8] Avoid using Span tmp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 4 ++-- 
1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 314f26d64d..4f247c434b 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -764,12 +764,12 @@ public static float CombinedShannonEntropy(Span x, Span y) if (Avx2.IsSupported) { double retVal = 0.0d; - Span tmp = stackalloc int[8]; + Vector256 tmp = Vector256.Zero; // has the size of the scratch space of sizeof(int) * 8 ref int xRef = ref MemoryMarshal.GetReference(x); ref int yRef = ref MemoryMarshal.GetReference(y); Vector256 sumXY256 = Vector256.Zero; Vector256 sumX256 = Vector256.Zero; - ref int tmpRef = ref MemoryMarshal.GetReference(tmp); + ref int tmpRef = ref Unsafe.As, int>(ref tmp); for (nint i = 0; i < 256; i += 8) { Vector256 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); From 5403fbd8b2a4f42e9a9deed923d3017d449b3ab9 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 12:34:34 +0100 Subject: [PATCH 8/8] Add better version of ReduceSum for Vector 256 --- src/ImageSharp/Common/Helpers/Numerics.cs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 9dc13079d6..fa0af823d5 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -828,11 +828,16 @@ public static int ReduceSum(Vector128 accumulator) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int ReduceSum(Vector256 accumulator) { - Vector128 vec0 = Avx2.ExtractVector128(accumulator, 0); - Vector128 vec1 = Avx2.ExtractVector128(accumulator, 1); - Vector128 sum128 = Sse2.Add(vec0, vec1); + // Add upper lane to lower lane. + Vector128 vsum = Sse2.Add(accumulator.GetLower(), accumulator.GetUpper()); - return ReduceSum(sum128); + // Add odd to even. 
+ vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_11_01_01)); + + // Add high to low. + vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10)); + + return Sse2.ConvertToInt32(vsum); } ///