SixLabors · JimBobSquarePants · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026
diff --git a/src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs b/src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs
@@ -5,6 +5,8 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Processing;
 
@@ -60,8 +62,126 @@ public static void Convert<TPixel>(this ColorProfileConverter converter, Image<T
                 converter.ConvertUsingIccProfile<Rgb, Rgb>(rgbSpan, rgbSpan);
 
                 // Copy the converted Rgb pixels back to the row as TPixel.
+                // Important: Preserve alpha from the existing row Vector4 values.
+                // We merge RGB from rgbSpan into row, leaving W untouched.
+                ref float srcRgb = ref Unsafe.As<Rgb, float>(ref MemoryMarshal.GetReference(rgbSpan));
+                ref float dstRow = ref Unsafe.As<Vector4, float>(ref MemoryMarshal.GetReference(row));
+
+                int count = rgbSpan.Length;
+                int i = 0;
+
+                // Loads the 16 floats that start at f as one 512-bit vector.
+                // ReadUnaligned places no alignment requirement on f.
+                [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                static Vector512<float> ReadVector512(ref float f)
+                    => Unsafe.ReadUnaligned<Vector512<float>>(
+                        ref Unsafe.As<float, byte>(ref f));
+
+                // Stores the 16 floats of v starting at f.
+                // WriteUnaligned places no alignment requirement on f.
+                [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                static void WriteVector512(ref float f, Vector512<float> v)
+                    => Unsafe.WriteUnaligned(
+                        ref Unsafe.As<float, byte>(ref f), v);
+
+                // Loads the 8 floats that start at f as one 256-bit vector.
+                // ReadUnaligned places no alignment requirement on f.
+                [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                static Vector256<float> ReadVector256(ref float f)
+                    => Unsafe.ReadUnaligned<Vector256<float>>(
+                        ref Unsafe.As<float, byte>(ref f));
+
+                // Stores the 8 floats of v starting at f.
+                // WriteUnaligned places no alignment requirement on f.
+                [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                static void WriteVector256(ref float f, Vector256<float> v)
+                    => Unsafe.WriteUnaligned(
+                        ref Unsafe.As<float, byte>(ref f), v);
+
+                if (Avx512F.IsSupported)
+                {
+                    // 4 pixels per iteration.
+                    //
+                    // Source layout (Rgb float stream, 12 floats):
+                    // [r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3]
+                    //
+                    // Destination layout (row Vector4 float stream, 16 floats):
+                    // [r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3]
+                    //
+                    // We use an overlapped load (16 floats) from the 3-float stride source.
+                    // The permute selects the RGB we need and inserts placeholders for alpha lanes.
+                    //
+                    // Then we blend RGB lanes into the existing destination, preserving alpha lanes.
+                    Vector512<int> rgbPerm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
+
+                    // BlendVariable selects from the second operand where the sign bit of the mask lane is set.
+                    // We want to overwrite lanes 0,1,2 then 4,5,6 then 8,9,10 then 12,13,14, and preserve lanes 3,7,11,15 (alpha).
+                    Vector512<float> rgbSelect = Vector512.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
+
+                    int quads = count >> 2;
+                    int simdQuads = quads - 1; // Leave the last quad for the scalar tail to avoid the final overlapped load reading past the end.
+
+                    for (int q = 0; q < simdQuads; q++)
+                    {
+                        Vector512<float> dst = ReadVector512(ref dstRow);
+                        Vector512<float> src = ReadVector512(ref srcRgb);
+
+                        Vector512<float> rgbx = Avx512F.PermuteVar16x32(src, rgbPerm);
+                        Vector512<float> merged = Avx512F.BlendVariable(dst, rgbx, rgbSelect);
+
+                        WriteVector512(ref dstRow, merged);
+
+                        // Advance input by 4 pixels (4 * 3 = 12 floats)
+                        srcRgb = ref Unsafe.Add(ref srcRgb, 12);
+
+                        // Advance output by 4 pixels (4 * 4 = 16 floats)
+                        dstRow = ref Unsafe.Add(ref dstRow, 16);
+
+                        i += 4;
+                    }
+                }
+                else if (Avx2.IsSupported)
+                {
+                    // 2 pixels per iteration.
+                    //
+                    // Same idea as AVX-512, but on 256-bit vectors.
+                    // We permute packed RGB into rgbx layout and blend into the existing destination,
+                    // preserving alpha lanes.
+                    Vector256<int> rgbPerm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
+
+                    Vector256<float> rgbSelect = Vector256.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
+
+                    int pairs = count >> 1;
+                    int simdPairs = pairs - 1; // Leave the last pair for the scalar tail to avoid the final overlapped load reading past the end.
+
+                    for (int p = 0; p < simdPairs; p++)
+                    {
+                        Vector256<float> dst = ReadVector256(ref dstRow);
+                        Vector256<float> src = ReadVector256(ref srcRgb);
+
+                        Vector256<float> rgbx = Avx2.PermuteVar8x32(src, rgbPerm);
+                        Vector256<float> merged = Avx.BlendVariable(dst, rgbx, rgbSelect);
+
+                        WriteVector256(ref dstRow, merged);
+
+                        // Advance input by 2 pixels (2 * 3 = 6 floats)
+                        srcRgb = ref Unsafe.Add(ref srcRgb, 6);
+
+                        // Advance output by 2 pixels (2 * 4 = 8 floats)
+                        dstRow = ref Unsafe.Add(ref dstRow, 8);
+
+                        i += 2;
+                    }
+                }
+
+                // Scalar tail.
+                // Handles:
+                // - the last skipped SIMD block (quad or pair)
+                // - any remainder
+                //
+                // Preserve alpha by writing Vector3 into the Vector4 storage.
                 ref Vector4 rowRef = ref MemoryMarshal.GetReference(row);
-                for (int i = 0; i < rgbSpan.Length; i++)
+                for (; i < count; i++)
                 {
                     Vector3 rgb = rgbSpan[i].AsVector3Unsafe();
                     Unsafe.As<Vector4, Vector3>(ref Unsafe.Add(ref rowRef, (uint)i)) = rgb;

diff --git a/src/ImageSharp/ColorProfiles/Rgb.cs b/src/ImageSharp/ColorProfiles/Rgb.cs
@@ -4,6 +4,8 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces;
 
 namespace SixLabors.ImageSharp.ColorProfiles;
@@ -105,10 +107,87 @@ public static void ToScaledVector4(ReadOnlySpan<Rgb> source, Span<Vector4> desti
     {
         Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
 
-        // TODO: Optimize via SIMD
-        for (int i = 0; i < source.Length; i++)
+        int length = source.Length;
+        if (length == 0)
         {
-            destination[i] = source[i].ToScaledVector4();
+            return;
+        }
+
+        ref Rgb srcRgb = ref MemoryMarshal.GetReference(source);
+        ref Vector4 dstV4 = ref MemoryMarshal.GetReference(destination);
+
+        // Reinterpret both spans as flat float streams:
+        // src: r0 g0 b0 r1 g1 b1 ...          (3 floats per pixel)
+        // dst: r0 g0 b0 a0 r1 g1 b1 a1 ...    (4 floats per pixel)
+        ref float src = ref Unsafe.As<Rgb, float>(ref srcRgb);
+        ref float dst = ref Unsafe.As<Vector4, float>(ref dstV4);
+
+        int i = 0; // Number of pixels already converted by a SIMD path.
+
+        if (Avx512F.IsSupported)
+        {
+            // 4 pixels per iteration. Each 16-float load consumes 12 floats (loads overlap by 4); perm lanes 3/7/11/15 are placeholders replaced by the blend below.
+            Vector512<int> perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
+            Vector512<float> ones = Vector512.Create(1F);
+
+            // BlendVariable selects from 'ones' where the sign bit of the mask lane is set.
+            // -0F sets only the sign bit, so lanes 3, 7, 11 and 15 (alpha) receive 1F.
+            Vector512<float> alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F);
+
+            int quads = length >> 2;
+
+            // Leave the last quad (4 pixels) for the scalar tail so the final 16-float load cannot run past the end of source.
+            int simdQuads = quads - 1;
+
+            for (int q = 0; q < simdQuads; q++)
+            {
+                Vector512<float> v = ReadVector512(ref src);
+                Vector512<float> rgbx = Avx512F.PermuteVar16x32(v, perm);
+                Vector512<float> rgba = Avx512F.BlendVariable(rgbx, ones, alphaSelect);
+
+                WriteVector512(ref dst, rgba);
+
+                src = ref Unsafe.Add(ref src, 12); // 4 pixels * 3 floats
+                dst = ref Unsafe.Add(ref dst, 16); // 4 pixels * 4 floats
+
+                i += 4;
+            }
+        }
+        else if (Avx2.IsSupported)
+        {
+            // 2 pixels per iteration. Each 8-float load consumes 6 floats (loads overlap by 2); perm lanes 3/7 are placeholders replaced by the blend below.
+            Vector256<int> perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
+
+            Vector256<float> ones = Vector256.Create(1F);
+
+            // vblendps mask: bit i selects lane i from 'ones' when set.
+            // We want lanes 3 and 7 (alpha) -> 0b10001000 = 0x88.
+            const byte alphaMask = 0x88;
+
+            int pairs = length >> 1;
+
+            // Leave the last pair (2 pixels) for the scalar tail so the final 8-float load cannot run past the end of source.
+            int simdPairs = pairs - 1;
+
+            for (int p = 0; p < simdPairs; p++)
+            {
+                Vector256<float> v = ReadVector256(ref src);
+                Vector256<float> rgbx = Avx2.PermuteVar8x32(v, perm);
+                Vector256<float> rgba = Avx.Blend(rgbx, ones, alphaMask);
+
+                WriteVector256(ref dst, rgba);
+
+                src = ref Unsafe.Add(ref src, 6); // 2 pixels * 3 floats
+                dst = ref Unsafe.Add(ref dst, 8); // 2 pixels * 4 floats
+
+                i += 2;
+            }
+        }
+
+        // Scalar tail: the deferred last SIMD block, any remainder, and the whole span when neither AVX path is taken.
+        for (; i < length; i++)
+        {
+            Unsafe.Add(ref dstV4, i) = Unsafe.Add(ref srcRgb, i).ToScaledVector4();
         }
     }
 
@@ -117,10 +196,75 @@ public static void FromScaledVector4(ReadOnlySpan<Vector4> source, Span<Rgb> des
     {
         Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
 
-        // TODO: Optimize via SIMD
-        for (int i = 0; i < source.Length; i++)
+        int length = source.Length;
+        if (length == 0)
         {
-            destination[i] = FromScaledVector4(source[i]);
+            return;
+        }
+
+        ref Vector4 srcV4 = ref MemoryMarshal.GetReference(source);
+        ref Rgb dstRgb = ref MemoryMarshal.GetReference(destination);
+
+        // Reinterpret both spans as flat float streams:
+        // src: r0 g0 b0 a0 r1 g1 b1 a1 ...    (4 floats per pixel)
+        // dst: r0 g0 b0 r1 g1 b1 ...          (3 floats per pixel)
+        ref float src = ref Unsafe.As<Vector4, float>(ref srcV4);
+        ref float dst = ref Unsafe.As<Rgb, float>(ref dstRgb);
+
+        int i = 0; // Number of pixels already converted by a SIMD path.
+
+        if (Avx512F.IsSupported)
+        {
+            // 4 pixels per iteration. Each 16-float store keeps 12 floats; the 4 alpha lanes permuted to the end spill into the next quad's slots and are overwritten later.
+            Vector512<int> idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
+
+            // Number of 4-pixel groups in the input.
+            int quads = length >> 2;
+
+            // Leave the last quad (4 pixels) for the scalar tail so the final 16-float store cannot run past the end of destination.
+            int simdQuads = quads - 1;
+
+            for (int q = 0; q < simdQuads; q++)
+            {
+                Vector512<float> v = ReadVector512(ref src);
+                Vector512<float> packed = Avx512F.PermuteVar16x32(v, idx);
+
+                WriteVector512(ref dst, packed);
+
+                src = ref Unsafe.Add(ref src, 16); // 4 pixels * 4 floats
+                dst = ref Unsafe.Add(ref dst, 12); // 4 pixels * 3 floats
+                i += 4;
+            }
+        }
+        else if (Avx2.IsSupported)
+        {
+            // 2 pixels per iteration. Each 8-float store keeps 6 floats; the 2 filler lanes spill into the next pair's slots and are overwritten later.
+            Vector256<int> idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0);
+
+            int pairs = length >> 1;
+
+            // Leave the last pair (2 pixels) for the scalar tail so the final 8-float store cannot run past the end of destination.
+            int simdPairs = pairs - 1;
+
+            int pairIndex = 0;
+            for (; pairIndex < simdPairs; pairIndex++)
+            {
+                Vector256<float> v = ReadVector256(ref src);
+                Vector256<float> packed = Avx2.PermuteVar8x32(v, idx);
+
+                WriteVector256(ref dst, packed);
+
+                src = ref Unsafe.Add(ref src, 8); // 2 pixels * 4 floats
+                dst = ref Unsafe.Add(ref dst, 6); // 2 pixels * 3 floats
+                i += 2;
+            }
+        }
+
+        // Scalar tail: the deferred last SIMD block, any remainder, and the whole span when neither AVX path is taken.
+        for (; i < length; i++)
+        {
+            Vector4 v = Unsafe.Add(ref srcV4, i);
+            Unsafe.Add(ref dstRgb, i) = FromScaledVector4(v);
         }
     }
 
@@ -288,4 +432,32 @@ private static Matrix4x4 GetRgbToCieXyzMatrix(RgbWorkingSpace workingSpace)
             M44 = 1F
         };
     }
+
+    /// <summary>Loads the 16 floats starting at <paramref name="src"/> as one vector; no alignment is required.</summary>
+    /// <param name="src">Reference to the first float to load.</param>
+    /// <returns>The loaded floats as a <see cref="Vector512{T}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static Vector512<float> ReadVector512(ref float src)
+        => Unsafe.ReadUnaligned<Vector512<float>>(ref Unsafe.As<float, byte>(ref src));
+
+    /// <summary>Loads the 8 floats starting at <paramref name="src"/> as one vector; no alignment is required.</summary>
+    /// <param name="src">Reference to the first float to load.</param>
+    /// <returns>The loaded floats as a <see cref="Vector256{T}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static Vector256<float> ReadVector256(ref float src)
+        => Unsafe.ReadUnaligned<Vector256<float>>(ref Unsafe.As<float, byte>(ref src));
+
+    /// <summary>Stores the 16 floats of <paramref name="value"/> starting at <paramref name="dst"/>; no alignment is required.</summary>
+    /// <param name="dst">Reference to the first float to overwrite.</param>
+    /// <param name="value">The vector to store.</param>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static void WriteVector512(ref float dst, Vector512<float> value)
+        => Unsafe.WriteUnaligned(ref Unsafe.As<float, byte>(ref dst), value);
+
+    /// <summary>Stores the 8 floats of <paramref name="value"/> starting at <paramref name="dst"/>; no alignment is required.</summary>
+    /// <param name="dst">Reference to the first float to overwrite.</param>
+    /// <param name="value">The vector to store.</param>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static void WriteVector256(ref float dst, Vector256<float> value)
+        => Unsafe.WriteUnaligned(ref Unsafe.As<float, byte>(ref dst), value);
 }
diff --git a/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs b/src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
@@ -122,6 +122,7 @@ protected override Image<TPixel> Decode<TPixel>(BufferedReadStream stream, Cance
                     this.ParseOptionalChunks(stream, metadata, this.webImageInfo.Features, buffer);
                 }
 
+                _ = this.TryConvertIccProfile(image);
                 return image;
             }
         }

diff --git a/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs b/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs
@@ -608,4 +608,17 @@ public void WebpDecoder_CanDecode_Issue2906<TPixel>(TestImageProvider<TPixel> pr
         image.DebugSave(provider);
         image.CompareToOriginal(provider, ReferenceDecoder);
     }
+
+    [Theory]
+    [WithFile(Icc.Perceptual, PixelTypes.Rgba32)]
+    [WithFile(Icc.PerceptualcLUTOnly, PixelTypes.Rgba32)]
+    public void Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile<TPixel>(TestImageProvider<TPixel> provider)
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        DecoderOptions convertOptions = new() { ColorProfileHandling = ColorProfileHandling.Convert };
+        using Image<TPixel> decoded = provider.GetImage(WebpDecoder.Instance, convertOptions);
+        decoded.DebugSave(provider);
+        decoded.CompareToReferenceOutput(provider); // Pixels must match the stored reference output.
+        Assert.Null(decoded.Metadata.IccProfile); // No ICC profile may remain on the decoded image.
+    }
 }
diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs
@@ -901,6 +901,12 @@ public static class Lossy
         public const string AlphaBlend2 = "Webp/alpha-blend-2.webp";
         public const string AlphaBlend3 = "Webp/alpha-blend-3.webp";
         public const string AlphaBlend4 = "Webp/alpha-blend-4.webp";
+
+        public static class Icc // Paths of the Webp/icc-profiles inputs used by the ColorProfileHandling.Convert decoder test.
+        {
+            public const string Perceptual = "Webp/icc-profiles/Perceptual.webp";
+            public const string PerceptualcLUTOnly = "Webp/icc-profiles/Perceptual-cLUT-only.webp";
+        }
     }
 
     public static class Tiff

diff --git a/...enColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual-cLUT-only.png b/...enColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual-cLUT-only.png
diff --git a/.../Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual.png b/.../Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile_Rgba32_Perceptual.png
diff --git a/tests/Images/Input/Webp/icc-profiles/Perceptual-cLUT-only.webp b/tests/Images/Input/Webp/icc-profiles/Perceptual-cLUT-only.webp
-Original file line number
+Diff line change
@@ Expand Up @@
                         this.ParseOptionalChunks(stream, metadata, this.webImageInfo.Features, buffer);
                     }
+                    _ = this.TryConvertIccProfile(image);
                     return image;
                 }
             }
@@ Expand Down @@