Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements around fixed #2418

Merged
merged 9 commits into from
Mar 30, 2023
4 changes: 2 additions & 2 deletions src/ImageSharp/ColorSpaces/Companding/SRgbCompanding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ public static float Compress(float channel)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void CompandAvx2(Span<Vector4> vectors, float[] table)
{
fixed (float* tablePointer = &table[0])
fixed (float* tablePointer = &MemoryMarshal.GetArrayDataReference(table))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This avoids the bound-checks.
See sharplab for example.

{
var scale = Vector256.Create((float)Scale);
Vector256<float> zero = Vector256<float>.Zero;
Expand Down Expand Up @@ -199,7 +199,7 @@ private static unsafe void CompandAvx2(Span<Vector4> vectors, float[] table)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void CompandScalar(Span<Vector4> vectors, float[] table)
{
fixed (float* tablePointer = &table[0])
fixed (float* tablePointer = &MemoryMarshal.GetArrayDataReference(table))
{
Vector4 zero = Vector4.Zero;
var scale = new Vector4(Scale);
Expand Down
42 changes: 10 additions & 32 deletions src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
/// 8x8 matrix of <see cref="short"/> coefficients.
/// </summary>
// ReSharper disable once InconsistentNaming
[StructLayout(LayoutKind.Explicit)]
internal unsafe partial struct Block8x8
[StructLayout(LayoutKind.Explicit, Size = 2 * Size)]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The unused fixed sized buffer to specify the size of the struct isn't neede, as the size can be set directly.

Codegen for creating the struct is also a bit better -- only zeroing the memory, instead of stack-cookie + zeroing.
Though that won't matter much -- the reduction of C# code is the motivation for this change.

internal partial struct Block8x8
{
/// <summary>
/// A number of scalar coefficients in a <see cref="Block8x8F"/>
/// </summary>
public const int Size = 64;

#pragma warning disable IDE0051 // Remove unused private member
/// <summary>
/// A placeholder buffer so the actual struct occupies exactly 64 * 2 bytes.
/// </summary>
/// <remarks>
/// This is not used directly in the code.
/// </remarks>
[FieldOffset(0)]
private fixed short data[Size];
#pragma warning restore IDE0051

/// <summary>
/// Gets or sets a <see cref="short"/> value at the given index
/// </summary>
Expand Down Expand Up @@ -74,9 +63,10 @@ internal unsafe partial struct Block8x8

public static Block8x8 Load(Span<short> data)
{
Unsafe.SkipInit(out Block8x8 result);
result.LoadFrom(data);
return result;
DebugGuard.MustBeGreaterThanOrEqualTo(data.Length, Size, "data is too small");

ref byte src = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(data));
return Unsafe.ReadUnaligned<Block8x8>(ref src);
}

/// <summary>
Expand Down Expand Up @@ -104,9 +94,10 @@ public short[] ToArray()
/// </summary>
public void CopyTo(Span<short> destination)
{
ref byte selfRef = ref Unsafe.As<Block8x8, byte>(ref this);
ref byte destRef = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<short, byte>(destination));
Unsafe.CopyBlockUnaligned(ref destRef, ref selfRef, Size * sizeof(short));
DebugGuard.MustBeGreaterThanOrEqualTo(destination.Length, Size, "destination is too small");

ref byte destRef = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(destination));
Unsafe.WriteUnaligned(ref destRef, this);
}

/// <summary>
Expand Down Expand Up @@ -135,19 +126,6 @@ public void LoadFrom(ReadOnlySpan<byte> source)
}
}

/// <summary>
/// Load raw 16bit integers from source.
/// </summary>
/// <param name="source">Source</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(Span<short> source)
{
ref byte sourceRef = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(source));
ref byte destRef = ref Unsafe.As<Block8x8, byte>(ref this);

Unsafe.CopyBlockUnaligned(ref destRef, ref sourceRef, Size * sizeof(short));
}

/// <summary>
/// Cast and copy <see cref="Size"/> <see cref="int"/>-s from the beginning of 'source' span.
/// </summary>
Expand Down
25 changes: 9 additions & 16 deletions src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,24 +101,17 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
set => this[((uint)y * 8) + (uint)x] = value;
}

public static Block8x8F Load(Span<float> data)
{
Block8x8F result = default;
result.LoadFrom(data);
Comment on lines -106 to -107
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Going via instance method needs to copy two times, which can be avoided.

In sharplab it's hard to see (no method names, just the address of the call), so here from the JIT directly:

JIT assembly
; Assembly listing for method Foo:Default(System.Span`1[float]):Block8x8F
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       56                   push     rsi
       4881EC00010000       sub      rsp, 256
       488BF1               mov      rsi, rcx

G_M000_IG02:                ;; offset=000BH
       488B12               mov      rdx, bword ptr [rdx]
       488D0C24             lea      rcx, bword ptr [rsp]
       41B800010000         mov      r8d, 256
       E80330AF5F           call     CORINFO_HELP_MEMCPY
       488BCE               mov      rcx, rsi
       488D1424             lea      rdx, bword ptr [rsp]
       41B800010000         mov      r8d, 256
       E8F12FAF5F           call     CORINFO_HELP_MEMCPY
       488BC6               mov      rax, rsi

G_M000_IG03:                ;; offset=0032H
       4881C400010000       add      rsp, 256
       5E                   pop      rsi
       C3                   ret

; Total bytes of code 59

; Assembly listing for method Foo:PR(System.Span`1[float]):Block8x8F_PR
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       56                   push     rsi
       488BF1               mov      rsi, rcx

G_M000_IG02:                ;; offset=0004H
       488B12               mov      rdx, bword ptr [rdx]
       488BCE               mov      rcx, rsi
       41B800010000         mov      r8d, 256
       E80B2FAF5F           call     CORINFO_HELP_MEMCPY
       488BC6               mov      rax, rsi

G_M000_IG03:                ;; offset=0018H
       5E                   pop      rsi
       C3                   ret

; Total bytes of code 26

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering: is it the same gain when you compare use cases on call sites, ie.

Block8x8F b = Block8x8F.Load(data)`;

after the PR change vs. typical usage on main:

Block8x8F b = default;
b.LoadFrom(data);

?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It depends what's done with the instance of Block8x8F. In this example Default2 shows no difference, as the JIT is smart enough to figure out what's going on. Namely: load the data onto the stack, then operate with the instance method of that type and use data on the stack.
But as soon the Block8x8F is passed around a copy needs to be made. Thus for me the instance LoadFrom has the potential for a perf-trap, and should be removed.

Besides that from an API point of view having an instance method to load (or better "fill") the block with data is a bit strange for an value-type*. Here a static method seems the right fit, like int.Parse, Vector128.Load, etc.
As it's an internal type this change seems fine from this view.

* for reference types this is pretty much OK

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides that from an API point of view having an instance method to load (or better "fill") the block with data is a bit strange

Note that Block8x8F is really huge. If I remember correctly, in early Jpeg code there were cases when existing on-stack blocks were being reused by repopulating it. If it's no longer the case in the current code, I'm happy with the change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checked it -- all usages are safe. Block8x8F is either passed by reference, and only copied where needed (e.g. constructors).

existing on-stack blocks were being reused by repopulating

When I understand it right, then this is a simple example of it.

Both variants work and produce the same machine-code.
(It's because the compilers hoists the loop-local block variable, and as it's a value-type the stack space needs to be reserved anyway).
Even if there are two blocks read then it's the same machine code.

I'll see if the C# code here can be simplified.

return result;
}

/// <summary>
/// Load raw 32bit floating point data from source.
/// </summary>
/// <param name="source">Source</param>
/// <param name="data">Source</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(Span<float> source)
public static Block8x8F Load(Span<float> data)
{
ref byte s = ref Unsafe.As<float, byte>(ref MemoryMarshal.GetReference(source));
ref byte d = ref Unsafe.As<Block8x8F, byte>(ref this);
DebugGuard.MustBeGreaterThanOrEqualTo(data.Length, Size, "data is too small");

Unsafe.CopyBlock(ref d, ref s, Size * sizeof(float));
ref byte src = ref Unsafe.As<float, byte>(ref MemoryMarshal.GetReference(data));
return Unsafe.ReadUnaligned<Block8x8F>(ref src);
}

/// <summary>
Expand All @@ -144,10 +137,10 @@ public unsafe void LoadFrom(Span<int> source)
[MethodImpl(InliningOptions.ShortMethod)]
public unsafe void ScaledCopyTo(float[] dest)
{
fixed (void* ptr = &this.V0L)
{
Marshal.Copy((IntPtr)ptr, dest, 0, Size);
}
DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, Size, "dest is too small");

ref byte destRef = ref Unsafe.As<float, byte>(ref MemoryMarshal.GetArrayDataReference(dest));
Unsafe.WriteUnaligned(ref destRef, this);
}

public float[] ToArray()
Expand Down
7 changes: 3 additions & 4 deletions src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1474,8 +1474,7 @@ private static uint Select(uint a, uint b, uint c, Span<short> scratch)
{
if (Sse2.IsSupported)
{
Span<short> output = scratch;
fixed (short* p = output)
fixed (short* ptr = &MemoryMarshal.GetReference(scratch))
{
Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
Expand All @@ -1489,8 +1488,8 @@ private static uint Select(uint a, uint b, uint c, Span<short> scratch)
Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
Sse2.Store((ushort*)p, diff);
int paMinusPb = output[3] + output[2] + output[1] + output[0];
Sse2.Store((ushort*)ptr, diff);
int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0];
return (paMinusPb <= 0) ? a : b;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ private static Block8x8F Create8x8FloatData()
}
}

var source = default(Block8x8F);
source.LoadFrom(result);
return source;
return Block8x8F.Load(result);
}
}
12 changes: 4 additions & 8 deletions tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ public void Load_Store_FloatArray()
Times,
() =>
{
var b = default(Block8x8F);
b.LoadFrom(data);
Block8x8F b = Block8x8F.Load(data);
b.ScaledCopyTo(mirror);
});

Expand All @@ -117,8 +116,7 @@ static void RunTest()
float[] expected = Create8x8FloatData();
ReferenceImplementations.Transpose8x8(expected);

var block8x8 = default(Block8x8F);
block8x8.LoadFrom(Create8x8FloatData());
Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData());

block8x8.TransposeInplace();

Expand Down Expand Up @@ -153,9 +151,8 @@ private static float[] Create8x8ColorCropTestData()
[Fact]
public void NormalizeColors()
{
var block = default(Block8x8F);
float[] input = Create8x8ColorCropTestData();
block.LoadFrom(input);
Block8x8F block = Block8x8F.Load(input);
this.Output.WriteLine("Input:");
this.PrintLinearData(input);

Expand Down Expand Up @@ -242,8 +239,7 @@ public void RoundInto()
{
float[] data = Create8x8RandomFloatData(-1000, 1000);

var source = default(Block8x8F);
source.LoadFrom(data);
Block8x8F source = Block8x8F.Load(data);
var dest = default(Block8x8);

source.RoundInto(ref dest);
Expand Down
3 changes: 1 addition & 2 deletions tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,7 @@ static void RunTest()
short[] expected = Create8x8ShortData();
ReferenceImplementations.Transpose8x8(expected);

var block8x8 = default(Block8x8);
block8x8.LoadFrom(Create8x8ShortData());
Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData());

block8x8.TransposeInplace();

Expand Down
15 changes: 5 additions & 10 deletions tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,7 @@ static void RunTest(string serialized)
int seed = FeatureTestRunner.Deserialize<int>(serialized);

Span<float> src = Create8x8RandomFloatData(MinInputValue, MaxInputValue, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
Block8x8F srcBlock = Block8x8F.Load(src);

float[] expectedDest = new float[64];
float[] temp = new float[64];
Expand Down Expand Up @@ -162,8 +161,7 @@ static void RunTest(string serialized)
public void TranformIDCT_4x4(int seed)
{
Span<float> src = Create8x8RandomFloatData(MinInputValue, MaxInputValue, seed, 4, 4);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
Block8x8F srcBlock = Block8x8F.Load(src);

float[] expectedDest = new float[64];
float[] temp = new float[64];
Expand Down Expand Up @@ -224,8 +222,7 @@ static void AssertScaledElementEquality(Span<float> expected, Span<float> actual
public void TranformIDCT_2x2(int seed)
{
Span<float> src = Create8x8RandomFloatData(MinInputValue, MaxInputValue, seed, 2, 2);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
Block8x8F srcBlock = Block8x8F.Load(src);

float[] expectedDest = new float[64];
float[] temp = new float[64];
Expand Down Expand Up @@ -286,8 +283,7 @@ static void AssertScaledElementEquality(Span<float> expected, Span<float> actual
public void TranformIDCT_1x1(int seed)
{
Span<float> src = Create8x8RandomFloatData(MinInputValue, MaxInputValue, seed, 1, 1);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
Block8x8F srcBlock = Block8x8F.Load(src);

float[] expectedDest = new float[64];
float[] temp = new float[64];
Expand Down Expand Up @@ -330,8 +326,7 @@ static void RunTest(string serialized)
int seed = FeatureTestRunner.Deserialize<int>(serialized);

Span<float> src = Create8x8RandomFloatData(MinInputValue, MaxInputValue, seed);
var block = default(Block8x8F);
block.LoadFrom(src);
Block8x8F block = Block8x8F.Load(src);

float[] expectedDest = new float[64];
float[] temp1 = new float[64];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ public void ForwardThenInverse(int seed)
{
float[] data = Create8x8RandomFloatData(-1000, 1000, seed);

var b0 = default(Block8x8F);
b0.LoadFrom(data);
Block8x8F b0 = Block8x8F.Load(data);

Block8x8F b1 = ReferenceImplementations.AccurateDCT.TransformFDCT(ref b0);
Block8x8F b2 = ReferenceImplementations.AccurateDCT.TransformIDCT(ref b1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ public void LLM_FDCT_IsEquivalentTo_AccurateImplementation(int seed)
{
float[] floatData = Create8x8RandomFloatData(-1000, 1000);

Block8x8F source = default;
source.LoadFrom(floatData);
Block8x8F source = Block8x8F.Load(floatData);

Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformFDCT(ref source);
Block8x8F actual = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformFDCT_UpscaleBy8(ref source);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ public static Block8x8F TransformIDCT(ref Block8x8F source)
float[] temp = new float[64];

IDCT2D_llm(s, d, temp);
Block8x8F result = default;
result.LoadFrom(d);
Block8x8F result = Block8x8F.Load(d);
return result;
}

Expand All @@ -49,8 +48,7 @@ public static Block8x8F TransformFDCT_UpscaleBy8(ref Block8x8F source)
float[] temp = new float[64];

FDCT2D_llm(s, d, temp);
Block8x8F result = default;
result.LoadFrom(d);
Block8x8F result = Block8x8F.Load(d);
return result;
}

Expand Down