Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Processing;

Expand Down Expand Up @@ -60,8 +62,126 @@ public static void Convert<TPixel>(this ColorProfileConverter converter, Image<T
converter.ConvertUsingIccProfile<Rgb, Rgb>(rgbSpan, rgbSpan);

// Copy the converted Rgb pixels back to the row as TPixel.
// Important: Preserve alpha from the existing row Vector4 values.
// We merge RGB from rgbSpan into row, leaving W untouched.
ref float srcRgb = ref Unsafe.As<Rgb, float>(ref MemoryMarshal.GetReference(rgbSpan));
ref float dstRow = ref Unsafe.As<Vector4, float>(ref MemoryMarshal.GetReference(row));

int count = rgbSpan.Length;
int i = 0;

// Reads 16 consecutive floats starting at 'f' as one 512-bit vector.
// The read goes through a byte reference, so 'f' does not need to be vector-aligned.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector512<float> ReadVector512(ref float f)
    => Unsafe.ReadUnaligned<Vector512<float>>(ref Unsafe.As<float, byte>(ref f));

// Writes the 16 floats of 'v' to consecutive memory starting at 'f'.
// The write goes through a byte reference, so 'f' does not need to be vector-aligned.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void WriteVector512(ref float f, Vector512<float> v)
    => Unsafe.WriteUnaligned(ref Unsafe.As<float, byte>(ref f), v);

// Reads 8 consecutive floats starting at 'f' as one 256-bit vector.
// The read goes through a byte reference, so 'f' does not need to be vector-aligned.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static Vector256<float> ReadVector256(ref float f)
    => Unsafe.ReadUnaligned<Vector256<float>>(ref Unsafe.As<float, byte>(ref f));

// Writes the 8 floats of 'v' to consecutive memory starting at 'f'.
// The write goes through a byte reference, so 'f' does not need to be vector-aligned.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void WriteVector256(ref float f, Vector256<float> v)
    => Unsafe.WriteUnaligned(ref Unsafe.As<float, byte>(ref f), v);

if (Avx512F.IsSupported)
{
// 4 pixels per iteration.
//
// Source layout (Rgb float stream, 12 floats):
// [r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3]
//
// Destination layout (row Vector4 float stream, 16 floats):
// [r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3]
//
// We use an overlapped load (16 floats) from the 3-float stride source.
// The permute selects the RGB we need and inserts placeholders for alpha lanes.
//
// Then we blend RGB lanes into the existing destination, preserving alpha lanes.
Vector512<int> rgbPerm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);

// BlendVariable selects from the second operand where the sign bit of the mask lane is set.
// We want to overwrite lanes 0,1,2 then 4,5,6 then 8,9,10 then 12,13,14, and preserve lanes 3,7,11,15 (alpha).
Vector512<float> rgbSelect = Vector512.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);

int quads = count >> 2;
int simdQuads = quads - 1; // Leave the last quad for the scalar tail to avoid the final overlapped load reading past the end.

for (int q = 0; q < simdQuads; q++)
{
Vector512<float> dst = ReadVector512(ref dstRow);
Vector512<float> src = ReadVector512(ref srcRgb);

Vector512<float> rgbx = Avx512F.PermuteVar16x32(src, rgbPerm);
Vector512<float> merged = Avx512F.BlendVariable(dst, rgbx, rgbSelect);

WriteVector512(ref dstRow, merged);

// Advance input by 4 pixels (4 * 3 = 12 floats)
srcRgb = ref Unsafe.Add(ref srcRgb, 12);

// Advance output by 4 pixels (4 * 4 = 16 floats)
dstRow = ref Unsafe.Add(ref dstRow, 16);

i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration.
//
// Same idea as AVX-512, but on 256-bit vectors.
// We permute packed RGB into rgbx layout and blend into the existing destination,
// preserving alpha lanes.
Vector256<int> rgbPerm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);

Vector256<float> rgbSelect = Vector256.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);

int pairs = count >> 1;
int simdPairs = pairs - 1; // Leave the last pair for the scalar tail to avoid the final overlapped load reading past the end.

for (int p = 0; p < simdPairs; p++)
{
Vector256<float> dst = ReadVector256(ref dstRow);
Vector256<float> src = ReadVector256(ref srcRgb);

Vector256<float> rgbx = Avx2.PermuteVar8x32(src, rgbPerm);
Vector256<float> merged = Avx.BlendVariable(dst, rgbx, rgbSelect);

WriteVector256(ref dstRow, merged);

// Advance input by 2 pixels (2 * 3 = 6 floats)
srcRgb = ref Unsafe.Add(ref srcRgb, 6);

// Advance output by 2 pixels (2 * 4 = 8 floats)
dstRow = ref Unsafe.Add(ref dstRow, 8);

i += 2;
}
}

// Scalar tail.
// Handles:
// - the last skipped SIMD block (quad or pair)
// - any remainder
//
// Preserve alpha by writing Vector3 into the Vector4 storage.
ref Vector4 rowRef = ref MemoryMarshal.GetReference(row);
for (int i = 0; i < rgbSpan.Length; i++)
for (; i < count; i++)
{
Vector3 rgb = rgbSpan[i].AsVector3Unsafe();
Unsafe.As<Vector4, Vector3>(ref Unsafe.Add(ref rowRef, (uint)i)) = rgb;
Expand Down
184 changes: 178 additions & 6 deletions src/ImageSharp/ColorProfiles/Rgb.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces;

namespace SixLabors.ImageSharp.ColorProfiles;
Expand Down Expand Up @@ -105,10 +107,87 @@ public static void ToScaledVector4(ReadOnlySpan<Rgb> source, Span<Vector4> desti
{
// Expands each Rgb in 'source' into a Vector4 in 'destination' (RGB copied, W set to 1 on the SIMD paths).
// NOTE(review): presumably the guard throws when 'destination' is shorter than 'source' — confirm in Guard.
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));

// NOTE(review): the TODO and the 'for' header below look like pre-change lines that the diff view
// kept next to their replacement — confirm against the merged file.
// TODO: Optimize via SIMD
for (int i = 0; i < source.Length; i++)
int length = source.Length;
if (length == 0)
{
// NOTE(review): the next assignment also looks like a pre-change line retained by the diff view.
destination[i] = source[i].ToScaledVector4();
return;
}

// Element-typed references; these are never advanced, so the scalar tail can index them with 'i'.
ref Rgb srcRgb = ref MemoryMarshal.GetReference(source);
ref Vector4 dstV4 = ref MemoryMarshal.GetReference(destination);

// Float streams:
// src: r0 g0 b0 r1 g1 b1 ...
// dst: r0 g0 b0 a0 r1 g1 b1 a1 ...
// NOTE(review): relies on Rgb being exactly three consecutive floats (R, G, B); the struct layout is not visible here.
ref float src = ref Unsafe.As<Rgb, float>(ref srcRgb);
ref float dst = ref Unsafe.As<Vector4, float>(ref dstV4);

// Number of pixels already converted by a SIMD loop; the scalar tail resumes from here.
int i = 0;

if (Avx512F.IsSupported)
{
// 4 pixels per iteration. Using overlapped 16-float loads.
// Permute indices: output lanes 4k..4k+2 take source floats 3k..3k+2;
// lane 4k+3 takes float 0 as a filler that the blend below replaces with 1.
Vector512<int> perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
Vector512<float> ones = Vector512.Create(1F);

// BlendVariable selects from 'ones' where the sign-bit of mask lane is set.
// Using -0f sets only the sign bit, producing an efficient "select lane" mask.
Vector512<float> alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F);

int quads = length >> 2;

// Leave the last quad (4 pixels) for the scalar tail.
// Each iteration loads 16 floats but consumes only 12, so the load reaches 4 floats into the
// following pixels; stopping one quad early keeps that over-read inside the source data.
// With fewer than 4 pixels 'simdQuads' is -1 and the loop body never runs.
int simdQuads = quads - 1;

for (int q = 0; q < simdQuads; q++)
{
Vector512<float> v = ReadVector512(ref src);
Vector512<float> rgbx = Avx512F.PermuteVar16x32(v, perm);
Vector512<float> rgba = Avx512F.BlendVariable(rgbx, ones, alphaSelect);

WriteVector512(ref dst, rgba);

// Advance input by 12 floats (4 Rgb) and output by 16 floats (4 Vector4).
src = ref Unsafe.Add(ref src, 12);
dst = ref Unsafe.Add(ref dst, 16);

i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration. Using overlapped 8-float loads.
// Permute indices: lanes 0-2 and 4-6 take source floats 0-2 and 3-5; lanes 3 and 7 are fillers.
Vector256<int> perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);

Vector256<float> ones = Vector256.Create(1F);

// vblendps mask: bit i selects lane i from 'ones' when set.
// We want lanes 3 and 7 -> 0b10001000 = 0x88.
const byte alphaMask = 0x88;

int pairs = length >> 1;

// Leave the last pair (2 pixels) for the scalar tail.
// Each iteration loads 8 floats but consumes only 6, so the load reaches 2 floats into the next pixel.
int simdPairs = pairs - 1;

for (int p = 0; p < simdPairs; p++)
{
Vector256<float> v = ReadVector256(ref src);
Vector256<float> rgbx = Avx2.PermuteVar8x32(v, perm);
Vector256<float> rgba = Avx.Blend(rgbx, ones, alphaMask);

WriteVector256(ref dst, rgba);

// Advance input by 6 floats (2 Rgb) and output by 8 floats (2 Vector4).
src = ref Unsafe.Add(ref src, 6);
dst = ref Unsafe.Add(ref dst, 8);

i += 2;
}
}

// Tail (and non-AVX paths)
// NOTE(review): the SIMD paths write W = 1; confirm Rgb.ToScaledVector4() produces the same W.
for (; i < length; i++)
{
Unsafe.Add(ref dstV4, i) = Unsafe.Add(ref srcRgb, i).ToScaledVector4();
}
}

Expand All @@ -117,10 +196,75 @@ public static void FromScaledVector4(ReadOnlySpan<Vector4> source, Span<Rgb> des
{
// Packs the X, Y, Z components of each Vector4 in 'source' into an Rgb in 'destination'; W is dropped.
// NOTE(review): presumably the guard throws when 'destination' is shorter than 'source' — confirm in Guard.
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));

// NOTE(review): the TODO and the 'for' header below look like pre-change lines that the diff view
// kept next to their replacement — confirm against the merged file.
// TODO: Optimize via SIMD
for (int i = 0; i < source.Length; i++)
int length = source.Length;
if (length == 0)
{
// NOTE(review): the next assignment also looks like a pre-change line retained by the diff view.
destination[i] = FromScaledVector4(source[i]);
return;
}

// Element-typed references; these are never advanced, so the scalar tail can index them with 'i'.
ref Vector4 srcV4 = ref MemoryMarshal.GetReference(source);
ref Rgb dstRgb = ref MemoryMarshal.GetReference(destination);

// Float streams:
// src: r0 g0 b0 a0 r1 g1 b1 a1 ...
// dst: r0 g0 b0 r1 g1 b1 ...
// NOTE(review): relies on Rgb being exactly three consecutive floats (R, G, B); the struct layout is not visible here.
ref float src = ref Unsafe.As<Vector4, float>(ref srcV4);
ref float dst = ref Unsafe.As<Rgb, float>(ref dstRgb);

// Number of pixels already converted by a SIMD loop; the scalar tail resumes from here.
int i = 0;

if (Avx512F.IsSupported)
{
// 4 pixels per iteration. Using overlapped 16-float stores:
// lanes 0-11 receive the packed RGB of 4 pixels, lanes 12-15 receive the four W values.
// The store therefore spills 4 floats into the slots of the following pixels, which are
// rewritten by the next iteration or by the scalar tail.
Vector512<int> idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);

// Number of 4-pixel groups in the input.
int quads = length >> 2;

// Leave the last quad (4 pixels) for the scalar tail.
// This keeps the 4-float spill of the final store inside the destination and guarantees it gets overwritten.
// With fewer than 4 pixels 'simdQuads' is -1 and the loop body never runs.
int simdQuads = quads - 1;

for (int q = 0; q < simdQuads; q++)
{
Vector512<float> v = ReadVector512(ref src);
Vector512<float> packed = Avx512F.PermuteVar16x32(v, idx);

WriteVector512(ref dst, packed);

// Advance input by 16 floats (4 Vector4) and output by 12 floats (4 Rgb).
src = ref Unsafe.Add(ref src, 16);
dst = ref Unsafe.Add(ref dst, 12);
i += 4;
}
}
else if (Avx2.IsSupported)
{
// 2 pixels per iteration, using overlapped 8-float stores:
// lanes 0-5 receive the packed RGB of 2 pixels; lanes 6-7 are fillers (copies of float 0)
// that spill into the next pixel's slots and are rewritten later.
Vector256<int> idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0);

int pairs = length >> 1;

// Leave the last pair (2 pixels) for the scalar tail.
int simdPairs = pairs - 1;

int pairIndex = 0;
for (; pairIndex < simdPairs; pairIndex++)
{
Vector256<float> v = ReadVector256(ref src);
Vector256<float> packed = Avx2.PermuteVar8x32(v, idx);

WriteVector256(ref dst, packed);

// Advance input by 8 floats (2 Vector4) and output by 6 floats (2 Rgb).
src = ref Unsafe.Add(ref src, 8);
dst = ref Unsafe.Add(ref dst, 6);
i += 2;
}
}

// Tail (and non-AVX paths)
// NOTE(review): the SIMD paths copy X, Y, Z unchanged; confirm the single-value
// FromScaledVector4 overload does the same (e.g. performs no clamping).
for (; i < length; i++)
{
Vector4 v = Unsafe.Add(ref srcV4, i);
Unsafe.Add(ref dstRgb, i) = FromScaledVector4(v);
}
}

Expand Down Expand Up @@ -288,4 +432,32 @@ private static Matrix4x4 GetRgbToCieXyzMatrix(RgbWorkingSpace workingSpace)
M44 = 1F
};
}

/// <summary>
/// Loads 16 consecutive floats starting at <paramref name="src"/> into a <see cref="Vector512{T}"/>.
/// </summary>
/// <param name="src">Reference to the first float to read. No particular alignment is required.</param>
/// <returns>The loaded vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<float> ReadVector512(ref float src)
    => Vector512.LoadUnsafe(ref src); // Built-in unaligned load; replaces the manual byte reinterpretation.

/// <summary>
/// Loads 8 consecutive floats starting at <paramref name="src"/> into a <see cref="Vector256{T}"/>.
/// </summary>
/// <param name="src">Reference to the first float to read. No particular alignment is required.</param>
/// <returns>The loaded vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<float> ReadVector256(ref float src)
    => Vector256.LoadUnsafe(ref src); // Built-in unaligned load; replaces the manual byte reinterpretation.

/// <summary>
/// Stores the 16 floats of <paramref name="value"/> to consecutive memory starting at <paramref name="dst"/>.
/// </summary>
/// <param name="dst">Reference to the first float to overwrite. No particular alignment is required.</param>
/// <param name="value">The vector to store.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector512(ref float dst, Vector512<float> value)
    => value.StoreUnsafe(ref dst); // Built-in unaligned store; replaces the manual byte reinterpretation.

/// <summary>
/// Stores the 8 floats of <paramref name="value"/> to consecutive memory starting at <paramref name="dst"/>.
/// </summary>
/// <param name="dst">Reference to the first float to overwrite. No particular alignment is required.</param>
/// <param name="value">The vector to store.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector256(ref float dst, Vector256<float> value)
    => value.StoreUnsafe(ref dst); // Built-in unaligned store; replaces the manual byte reinterpretation.
}
1 change: 1 addition & 0 deletions src/ImageSharp/Formats/Webp/WebpDecoderCore.cs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ protected override Image<TPixel> Decode<TPixel>(BufferedReadStream stream, Cance
this.ParseOptionalChunks(stream, metadata, this.webImageInfo.Features, buffer);
}

_ = this.TryConvertIccProfile(image);
return image;
}
}
Expand Down
13 changes: 13 additions & 0 deletions tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -608,4 +608,17 @@ public void WebpDecoder_CanDecode_Issue2906<TPixel>(TestImageProvider<TPixel> pr
image.DebugSave(provider);
image.CompareToOriginal(provider, ReferenceDecoder);
}

[Theory]
[WithFile(Icc.Perceptual, PixelTypes.Rgba32)]
[WithFile(Icc.PerceptualcLUTOnly, PixelTypes.Rgba32)]
public void Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile<TPixel>(TestImageProvider<TPixel> provider)
    where TPixel : unmanaged, IPixel<TPixel>
{
    // Decode with color profile handling set to Convert.
    DecoderOptions convertOptions = new()
    {
        ColorProfileHandling = ColorProfileHandling.Convert
    };

    using Image<TPixel> decoded = provider.GetImage(WebpDecoder.Instance, convertOptions);

    decoded.DebugSave(provider);
    decoded.CompareToReferenceOutput(provider);

    // The decoded image's metadata is expected to carry no ICC profile.
    Assert.Null(decoded.Metadata.IccProfile);
}
}
6 changes: 6 additions & 0 deletions tests/ImageSharp.Tests/TestImages.cs
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,12 @@ public static class Lossy
public const string AlphaBlend2 = "Webp/alpha-blend-2.webp";
public const string AlphaBlend3 = "Webp/alpha-blend-3.webp";
public const string AlphaBlend4 = "Webp/alpha-blend-4.webp";

// Relative paths of the WebP test images stored under "Webp/icc-profiles".
// NOTE(review): presumably these files embed ICC profiles — confirm against the test assets.
public static class Icc
{
// Image named "Perceptual".
public const string Perceptual = "Webp/icc-profiles/Perceptual.webp";
// Image named "Perceptual-cLUT-only".
public const string PerceptualcLUTOnly = "Webp/icc-profiles/Perceptual-cLUT-only.webp";
}
}

public static class Tiff
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading