From 1f6a503b9b3e57480cb73ae9f1218143228802bd Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 8 Dec 2021 13:05:47 +0100 Subject: [PATCH 1/5] Add SSE2 version of Vp8_Sse16X16 and Vp8_Sse16X8 --- .../Formats/Webp/Lossy/LossyUtils.cs | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index af5f136fa9..69ce6c7f98 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -40,11 +40,33 @@ internal static class LossyUtils // Note: method name in libwebp reference implementation is called VP8SSE16x16. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8_Sse16X16(Span a, Span b) => Vp8_SseNxN(a, b, 16, 16); + public static int Vp8_Sse16X16(Span a, Span b) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + return Vp8_Sse16xN_Sse2(a, b, 8); + } +#endif + { + return Vp8_SseNxN(a, b, 16, 16); + } + } // Note: method name in libwebp reference implementation is called VP8SSE16x8. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8_Sse16X8(Span a, Span b) => Vp8_SseNxN(a, b, 16, 8); + public static int Vp8_Sse16X8(Span a, Span b) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + return Vp8_Sse16xN_Sse2(a, b, 4); + } +#endif + { + return Vp8_SseNxN(a, b, 16, 8); + } + } // Note: method name in libwebp reference implementation is called VP8SSE4x4. [MethodImpl(InliningOptions.ShortMethod)] @@ -146,6 +168,52 @@ public static int Vp8_SseNxN(Span a, Span b, int w, int h) return count; } +#if SUPPORTS_RUNTIME_INTRINSICS + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) + { + Vector128 sum = Vector128.Zero; + int offset = 0; + for (int i = 0; i < numPairs; i++) + { + // Load values. + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); + Vector128 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset)); + Vector128 b0 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset)); + Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps)); + Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps)); + + Vector128 sum1 = SubtractAndAccumulate(a0, b0); + Vector128 sum2 = SubtractAndAccumulate(a1, b1); + sum = Sse2.Add(sum, Sse2.Add(sum1, sum2)); + + offset += 2 * WebpConstants.Bps; + } + + return Numerics.ReduceSum(sum); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) + { + // Take abs(a-b) in 8b. + Vector128 ab = Sse2.SubtractSaturate(a, b); + Vector128 ba = Sse2.SubtractSaturate(b, a); + Vector128 absAb = Sse2.Or(ab, ba); + + // Zero-extend to 16b. + Vector128 c0 = Sse2.UnpackLow(absAb, Vector128.Zero); + Vector128 c1 = Sse2.UnpackHigh(absAb, Vector128.Zero); + + // Multiply with self. + Vector128 sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector128 sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + + return Sse2.Add(sum1, sum2); + } +#endif + [MethodImpl(InliningOptions.ShortMethod)] public static void Vp8Copy4X4(Span src, Span dst) => Copy(src, dst, 4, 4); From 1e2278b13ce9da5bebda9b261aa5f50c1c2b5531 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 8 Dec 2021 13:36:52 +0100 Subject: [PATCH 2/5] Add tests for Sse16X16 and Sse16X8 --- .../Formats/WebP/LossyUtilsTests.cs | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index aaffac443d..9625008ca8 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -76,6 +76,124 @@ private static void RunTransformOneTest() Assert.True(expected.SequenceEqual(dst)); } + private static void RunVp8Sse16X16Test() + { + // arrange + byte[] a = + { + 154, 154, 151, 151, 149, 148, 151, 157, 163, 163, 154, 132, 102, 98, 104, 108, 107, 104, 104, 103, + 101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, 147, 147, 146, 159, + 164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, 172, 172, 172, 168, + 170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, 93, 90, 102, 107, + 104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, 150, 149, 152, 151, + 148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100, 102, 102, 121, 117, + 170, 170, 169, 171, 171, 179, 173, 175, 149, 151, 152, 151, 148, 154, 162, 157, 154, 154, 151, 132, + 92, 89, 101, 108, 104, 102, 101, 101, 103, 103, 123, 118, 171, 168, 177, 173, 171, 178, 172, 176, + 152, 152, 152, 151, 154, 162, 161, 155, 149, 157, 156, 129, 92, 87, 101, 107, 102, 100, 107, 100, + 101, 102, 123, 118, 170, 175, 182, 172, 171, 179, 173, 175, 152, 151, 154, 155, 160, 162, 161, 153, + 150, 156, 153, 129, 92, 91, 102, 106, 100, 109, 115, 99, 101, 102, 124, 120, 171, 179, 178, 172, + 171, 181, 171, 173, 154, 154, 154, 162, 160, 158, 156, 152, 153, 157, 151, 128, 86, 86, 102, 105, + 102, 122, 114, 99, 101, 102, 125, 120, 178, 173, 177, 172, 171, 180, 172, 173, 154, 152, 158, 163, + 150, 148, 148, 156, 151, 158, 152, 129, 87, 87, 101, 105, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 154, 151, 165, 156, 141, 137, 146, 158, 152, 159, 152, 133, + 90, 88, 99, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 154, 160, 164, 150, 126, 127, 149, 159, 155, 161, 153, 131, 84, 86, 97, 103, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 157, 167, 157, 137, 102, 128, 155, 161, + 157, 159, 154, 134, 84, 82, 97, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 163, 163, 150, 113, 78, 132, 156, 162, 159, 160, 154, 132, 83, 78, 91, 97, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 163, 157, 137, 80, 78, + 131, 154, 163, 157, 159, 149, 131, 82, 77, 94, 100, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 159, 151, 108, 72, 88, 132, 156, 162, 159, 157, 151, 130, 79, 78, + 95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 151, 130, + 82, 82, 89, 134, 154, 161, 161, 157, 152, 129, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204 + }; + + byte[] b = + { + 150, 150, 150, 150, 146, 149, 152, 154, 164, 166, 154, 132, 99, 92, 106, 112, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 150, 150, 150, 150, 146, 149, 152, 154, + 161, 164, 151, 130, 93, 86, 100, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 150, 150, 150, 150, 146, 149, 152, 154, 158, 161, 148, 127, 93, 86, 100, 106, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 150, 150, 150, 150, + 146, 149, 152, 154, 156, 159, 146, 125, 99, 92, 106, 112, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 148, 148, 148, 148, 149, 158, 162, 159, 155, 155, 153, 129, + 94, 87, 101, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 151, 151, 151, 151, 152, 159, 161, 156, 155, 155, 153, 129, 94, 87, 101, 106, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 154, 154, 154, 154, 156, 161, 159, 152, + 155, 155, 153, 129, 94, 87, 101, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 156, 156, 156, 156, 159, 162, 158, 149, 155, 155, 153, 129, 94, 87, 101, 106, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 152, 153, 157, 162, + 150, 149, 149, 151, 155, 160, 150, 131, 91, 90, 104, 104, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 152, 156, 158, 157, 140, 137, 145, 159, 155, 160, 150, 131, + 89, 88, 102, 101, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 153, 161, 160, 149, 118, 128, 147, 162, 155, 160, 150, 131, 86, 85, 99, 98, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 154, 165, 161, 144, 96, 128, 154, 159, 155, + 160, 150, 131, 83, 82, 97, 96, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 161, 160, 149, 105, 78, 127, 156, 170, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 160, 160, 133, 85, 81, 129, 155, + 167, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 156, 147, 109, 76, 85, 130, 153, 163, 156, 156, 154, 130, 81, 77, 95, 102, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 152, 128, 87, 83, + 88, 132, 152, 159, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204 + }; + + int expected = 2063; + + // act + int actual = LossyUtils.Vp8_Sse16X16(a, b); + + // assert + Assert.Equal(expected, actual); + } + + private static void RunVp8Sse16X8Test() + { + // arrange + byte[] a = + { + 107, 104, 104, 103, 101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, + 147, 147, 146, 159, 164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, + 172, 172, 172, 168, 170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, + 93, 90, 102, 107, 104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, + 150, 149, 152, 151, 148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100, + 102, 102, 121, 117, 170, 170, 169, 171, 171, 179, 173, 175, 149, 151, 152, 151, 148, 154, 162, 157, + 154, 154, 151, 132, 92, 89, 101, 108, 104, 102, 101, 101, 103, 103, 123, 118, 171, 168, 177, 173, + 171, 178, 172, 176, 152, 152, 152, 151, 154, 162, 161, 155, 149, 157, 156, 129, 92, 87, 101, 107, + 102, 100, 107, 100, 101, 102, 123, 118, 170, 175, 182, 172, 171, 179, 173, 175, 152, 151, 154, 155, + 160, 162, 161, 153, 150, 156, 153, 129, 92, 91, 102, 106, 100, 109, 115, 99, 101, 102, 124, 120, + 171, 179, 178, 172, 171, 181, 171, 173, 154, 154, 154, 162, 160, 158, 156, 152, 153, 157, 151, 128, + 86, 86, 102, 105, 102, 122, 114, 99, 101, 102, 125, 120, 178, 173, 177, 172, 171, 180, 172, 173, + 154, 152, 158, 163, 150, 148, 148, 156, 151, 158, 152, 129, 87, 87, 101, 105 + }; + + byte[] b = + { + 103, 103, 103, 103, 101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175, 150, 150, 150, 150, + 146, 149, 152, 154, 161, 164, 151, 130, 93, 86, 100, 106, 103, 103, 103, 103, 101, 106, 122, 114, + 171, 171, 171, 171, 171, 177, 169, 175, 150, 150, 150, 150, 146, 149, 152, 154, 158, 161, 148, 127, + 93, 86, 100, 106, 103, 103, 103, 103, 101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175, + 150, 150, 150, 150, 146, 149, 152, 154, 156, 159, 146, 125, 99, 92, 106, 112, 103, 103, 103, 103, + 101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175, 148, 148, 148, 148, 149, 158, 162, 159, + 155, 155, 153, 129, 94, 87, 101, 106, 102, 100, 100, 102, 100, 101, 120, 122, 170, 176, 176, 170, + 174, 180, 171, 177, 151, 151, 151, 151, 152, 159, 161, 156, 155, 155, 153, 129, 94, 87, 101, 106, + 102, 105, 105, 102, 100, 101, 120, 122, 170, 176, 176, 170, 174, 180, 171, 177, 154, 154, 154, 154, + 156, 161, 159, 152, 155, 155, 153, 129, 94, 87, 101, 106, 102, 112, 112, 102, 100, 101, 120, 122, + 170, 176, 176, 170, 174, 180, 171, 177, 156, 156, 156, 156, 159, 162, 158, 149, 155, 155, 153, 129, + 94, 87, 101, 106, 102, 117, 117, 102, 100, 101, 120, 122, 170, 176, 176, 170, 174, 180, 171, 177, + 152, 153, 157, 162, 150, 149, 149, 151, 155, 160, 150, 131, 91, 90, 104, 104 + }; + + int expected = 749; + + // act + int actual = LossyUtils.Vp8_Sse16X8(a, b); + + // assert + Assert.Equal(expected, actual); + } + private static void RunVp8Sse4X4Test() { // arrange @@ -168,6 +286,12 @@ private static void RunHadamardTransformTest() [Fact] public void RunTransformOne_Works() => RunTransformOneTest(); + [Fact] + public void Vp8Sse16X16_Works() => RunVp8Sse16X16Test(); + + [Fact] + public void Vp8Sse16X8_Works() => RunVp8Sse16X8Test(); + [Fact] public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test(); @@ -190,6 +314,18 @@ private static void RunHadamardTransformTest() [Fact] public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic); + [Fact] + public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll); + + [Fact] + public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2); + + [Fact] + public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll); + + [Fact] + public void Vp8Sse16X8_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2); + [Fact] public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); From 3d00c68a721383aca8155f3d73ad0bc7b0613044 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 8 Dec 2021 14:03:48 +0100 Subject: [PATCH 3/5] Add AVX2 version of Vp8_Sse16X16 and Vp8_Sse16X8 --- .../Formats/Webp/Lossy/LossyUtils.cs | 62 +++++++++++++++++++ .../Formats/WebP/LossyUtilsTests.cs | 6 ++ 2 files changed, 68 insertions(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 69ce6c7f98..abaf720892 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -43,6 +43,11 @@ internal static class LossyUtils public static int Vp8_Sse16X16(Span a, Span b) { #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + return Vp8_Sse16xN_Avx2(a, b, 4); + } + if (Sse2.IsSupported) { return Vp8_Sse16xN_Sse2(a, b, 8); @@ -58,6 +63,11 @@ public static int Vp8_Sse16X16(Span a, Span b) public static int Vp8_Sse16X8(Span a, Span b) { #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + return Vp8_Sse16xN_Avx2(a, b, 2); + } + if (Sse2.IsSupported) { return Vp8_Sse16xN_Sse2(a, b, 4); @@ -194,6 +204,39 @@ private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) return Numerics.ReduceSum(sum); } + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) + { + Vector256 sum = Vector256.Zero; + int offset = 0; + for (int i = 0; i < numPairs; i++) + { + // Load values. + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); + var a0 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref aRef, offset)), + Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps))); + var b0 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref bRef, offset)), + Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps))); + var a1 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (2 * WebpConstants.Bps))), + Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (3 * WebpConstants.Bps)))); + var b1 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))), + Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps)))); + + Vector256 sum1 = SubtractAndAccumulate(a0, b0); + Vector256 sum2 = SubtractAndAccumulate(a1, b1); + sum = Avx2.Add(sum, Avx2.Add(sum1, sum2)); + + offset += 4 * WebpConstants.Bps; + } + + return Numerics.ReduceSum(sum); + } + [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) { @@ -212,6 +255,25 @@ private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 return Sse2.Add(sum1, sum2); } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 SubtractAndAccumulate(Vector256 a, Vector256 b) + { + // Take abs(a-b) in 8b. + Vector256 ab = Avx2.SubtractSaturate(a, b); + Vector256 ba = Avx2.SubtractSaturate(b, a); + Vector256 absAb = Avx2.Or(ab, ba); + + // Zero-extend to 16b. + Vector256 c0 = Avx2.UnpackLow(absAb, Vector256.Zero); + Vector256 c1 = Avx2.UnpackHigh(absAb, Vector256.Zero); + + // Multiply with self. + Vector256 sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector256 sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + + return Avx2.Add(sum1, sum2); + } #endif [MethodImpl(InliningOptions.ShortMethod)] diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 9625008ca8..cc5f1b4c27 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -320,12 +320,18 @@ private static void RunHadamardTransformTest() [Fact] public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2); + [Fact] + public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2); + [Fact] public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll); [Fact] public void Vp8Sse16X8_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2); + [Fact] + public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2); + [Fact] public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); From 1ae1d8ed6675f4b5d1812b892461edc5b977025e Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:19:52 +0100 Subject: [PATCH 4/5] Use nint for offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index abaf720892..1b7e5633f6 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -183,7 +183,7 @@ public static int Vp8_SseNxN(Span a, Span b, int w, int h) private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) { Vector128 sum = Vector128.Zero; - int offset = 0; + nint offset = 0; for (int i = 0; i < numPairs; i++) { // Load values. @@ -208,7 +208,7 @@ private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) { Vector256 sum = Vector256.Zero; - int offset = 0; + nint offset = 0; for (int i = 0; i < numPairs; i++) { // Load values. From 8e1c3fa6bfab40d00d4edabf5147679ee3d1d3c3 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 9 Dec 2021 14:23:46 +0100 Subject: [PATCH 5/5] Move GetReference outside of the loop --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 1b7e5633f6..8938ede0a1 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -184,11 +184,11 @@ private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) { Vector128 sum = Vector128.Zero; nint offset = 0; + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); for (int i = 0; i < numPairs; i++) { // Load values. - ref byte aRef = ref MemoryMarshal.GetReference(a); - ref byte bRef = ref MemoryMarshal.GetReference(b); Vector128 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset)); Vector128 b0 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset)); Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps)); @@ -209,11 +209,11 @@ private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) { Vector256 sum = Vector256.Zero; nint offset = 0; + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); for (int i = 0; i < numPairs; i++) { // Load values. - ref byte aRef = ref MemoryMarshal.GetReference(a); - ref byte bRef = ref MemoryMarshal.GetReference(b); var a0 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref aRef, offset)), Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps)));