diff --git a/src/VCDiff/Encoders/BlockHash.cs b/src/VCDiff/Encoders/BlockHash.cs index 7af1f70..448ce5f 100644 --- a/src/VCDiff/Encoders/BlockHash.cs +++ b/src/VCDiff/Encoders/BlockHash.cs @@ -58,9 +58,9 @@ public BlockHash(ByteBuffer sin, int offset, RollingHash hasher, int blockSize = hashTableMask = (ulong)tableSize - 1; #if NET5_0 - hashTable = GC.AllocateUninitializedArray((int) tableSize); - nextBlockTable = GC.AllocateUninitializedArray((int) blocksCount); - lastBlockTable = GC.AllocateUninitializedArray((int) blocksCount); + hashTable = GC.AllocateUninitializedArray((int) tableSize, true); + nextBlockTable = GC.AllocateUninitializedArray((int) blocksCount, true); + lastBlockTable = GC.AllocateUninitializedArray((int) blocksCount, true); #else hashTable = new long[tableSize]; nextBlockTable = new long[blocksCount]; @@ -73,9 +73,9 @@ public BlockHash(ByteBuffer sin, int offset, RollingHash hasher, int blockSize = private void SetTablesToInvalid() { - Array.Fill(lastBlockTable, -1); - Array.Fill(nextBlockTable, -1); - Array.Fill(hashTable, -1); + Intrinsics.FillArrayVectorized(lastBlockTable, -1); + Intrinsics.FillArrayVectorized(nextBlockTable, -1); + Intrinsics.FillArrayVectorized(hashTable, -1); } private long CalcTableSize() @@ -188,7 +188,6 @@ public unsafe void FindBestMatch(ulong hash, long candidateStart, long targetSta blockNumber = NextMatchingBlock(blockNumber, candidateStart, sourcePtr, targetPtr, target)) { long sourceMatchOffset = blockNumber * blockSize; - long sourceStart = blockNumber * blockSize; long sourceMatchEnd = sourceMatchOffset + blockSize; long targetMatchOffset = candidateStart - targetStart; long targetMatchEnd = targetMatchOffset + blockSize; @@ -206,11 +205,8 @@ public unsafe void FindBestMatch(ulong hash, long candidateStart, long targetSta long targetBytesToRight = targetSize - targetMatchEnd; long rightLimit = Math.Min(sourceBytesToRight, targetBytesToRight); - long rightMatching = MatchingBytesToRight(sourceMatchEnd, 
targetStart + targetMatchEnd, sourcePtr, targetPtr, - target, rightLimit); + long rightMatching = MatchingBytesToRight(sourceMatchEnd, targetStart + targetMatchEnd, sourcePtr, targetPtr, target, rightLimit); matchSize += rightMatching; - //sourceMatchEnd += rightMatching; - //targetMatchEnd += rightMatching; m.ReplaceIfBetterMatch(matchSize, sourceMatchOffset + offset, targetMatchOffset); } } @@ -267,55 +263,75 @@ private unsafe bool BlockContentsMatch(long block1, long tOffset, byte *sourcePt long tLen = target.Length; byte* sPtr = sourcePtr; byte* tPtr = targetPtr; + + if (sOffset > sLen || tOffset > tLen) + return false; + #if NETCOREAPP3_1 || NET5_0 - if (Avx2.IsSupported && lengthToExamine >= 32) + if (Avx2.IsSupported && lengthToExamine >= Intrinsics.AvxRegisterSize) { - if (sOffset > sLen || tOffset > tLen) return false; - for (; sOffset >= 32 && tOffset >= 32 && - lengthToExamine >= 32; sOffset += 32, tOffset += 32, lengthToExamine -= 32) + if (sOffset >= Intrinsics.AvxRegisterSize && tOffset >= Intrinsics.AvxRegisterSize) { - Vector256 lv = Avx.LoadVector256(&sPtr[sOffset]); - Vector256 rv = Avx.LoadVector256(&tPtr[tOffset]); - if (Avx2.MoveMask(Avx2.CompareEqual(lv, rv)) != EQMASK) return false; + while (lengthToExamine >= Intrinsics.AvxRegisterSize) + { + Vector256 lv = Avx.LoadVector256(&sPtr[sOffset]); + Vector256 rv = Avx.LoadVector256(&tPtr[tOffset]); + if (Avx2.MoveMask(Avx2.CompareEqual(lv, rv)) != EQMASK) + return false; + + sOffset += Intrinsics.AvxRegisterSize; + tOffset += Intrinsics.AvxRegisterSize; + lengthToExamine -= Intrinsics.AvxRegisterSize; + } } } - if (Sse2.IsSupported && lengthToExamine >= 16) + if (Sse2.IsSupported && lengthToExamine >= Intrinsics.SseRegisterSize) { - if (sOffset > sLen || tOffset > tLen) return false; - - for (; sOffset >= 16 && tOffset >= 16 && - lengthToExamine >= 16; sOffset += 16, tOffset += 16, lengthToExamine -= 16) + if (sOffset >= Intrinsics.SseRegisterSize && tOffset >= Intrinsics.SseRegisterSize) { - 
Vector128 lv = Sse2.LoadVector128(&sPtr[sOffset]); - Vector128 rv = Sse2.LoadVector128(&tPtr[tOffset]); - if ((uint) Sse2.MoveMask(Sse2.CompareEqual(lv, rv)) != ushort.MaxValue) return false; + while (lengthToExamine >= Intrinsics.SseRegisterSize) + { + Vector128 lv = Sse2.LoadVector128(&sPtr[sOffset]); + Vector128 rv = Sse2.LoadVector128(&tPtr[tOffset]); + if ((uint)Sse2.MoveMask(Sse2.CompareEqual(lv, rv)) != ushort.MaxValue) + return false; + + sOffset += Intrinsics.SseRegisterSize; + tOffset += Intrinsics.SseRegisterSize; + lengthToExamine -= Intrinsics.SseRegisterSize; + } } } -#endif +#else int vectorSize = Vector.Count; - if (lengthToExamine >= vectorSize) { var sBuf = source.AsSpan(); var tBuf = target.AsSpan(); - - if (sOffset > sLen || tOffset > tLen) - return false; - - for (; sOffset >= vectorSize && tOffset >= vectorSize && - lengthToExamine >= vectorSize; sOffset += vectorSize, tOffset += vectorSize, lengthToExamine -= vectorSize) + + if (sOffset >= vectorSize && tOffset >= vectorSize) { - Vector lv = new Vector(sBuf.Slice(sOffset)); - Vector rv = new Vector(tBuf.Slice((int) tOffset)); - if (!Vector.EqualsAll(lv, rv)) - return false; + while (lengthToExamine >= vectorSize) + { + Vector lv = new Vector(sBuf.Slice(sOffset)); + Vector rv = new Vector(tBuf.Slice((int)tOffset)); + if (!Vector.EqualsAll(lv, rv)) + return false; + + sOffset += vectorSize; + tOffset += vectorSize; + lengthToExamine -= vectorSize; + } } } +#endif while (lengthToExamine > 0 && !(sOffset > sLen || tOffset > tLen)) { - if (sPtr[sOffset] != tPtr[tOffset]) return false; + if (sPtr[sOffset] != tPtr[tOffset]) + return false; + --lengthToExamine; ++sOffset; ++tOffset; @@ -372,16 +388,23 @@ private unsafe long MatchingBytesToLeftAvx2(long start, long tstart, byte* sourc byte* tPtr = targetPtr; byte* sPtr = sourcePtr; - for (; (sindex >= 32 && tindex >= 32) && bytesFound <= maxBytes - 32; bytesFound += 32) + if (sindex >= Intrinsics.AvxRegisterSize && tindex >= 
Intrinsics.AvxRegisterSize) { - tindex -= 32; - sindex -= 32; - var lv = Avx2.LoadVector256(&sPtr[sindex]); - var rv = Avx2.LoadVector256(&tPtr[tindex]); - if (Avx2.MoveMask(Avx2.CompareEqual(lv, rv)) == EQMASK) continue; - tindex += 32; - sindex += 32; - break; + while (bytesFound <= maxBytes - Intrinsics.AvxRegisterSize) + { + tindex -= Intrinsics.AvxRegisterSize; + sindex -= Intrinsics.AvxRegisterSize; + var lv = Avx2.LoadVector256(&sPtr[sindex]); + var rv = Avx2.LoadVector256(&tPtr[tindex]); + if (Avx2.MoveMask(Avx2.CompareEqual(lv, rv)) != EQMASK) + { + tindex += Intrinsics.AvxRegisterSize; + sindex += Intrinsics.AvxRegisterSize; + break; + } + + bytesFound += Intrinsics.AvxRegisterSize; + } } while (bytesFound < maxBytes) @@ -410,18 +433,25 @@ private unsafe long MatchingBytesToLeftSse2(long start, long tstart, byte* sourc byte* tPtr = targetPtr; byte* sPtr = sourcePtr; - for (; (sindex >= 16 && tindex >= 16) && bytesFound <= maxBytes - 16; bytesFound += 16) + if (sindex >= Intrinsics.SseRegisterSize && tindex >= Intrinsics.SseRegisterSize) { - tindex -= 16; - sindex -= 16; - var lv = Sse2.LoadVector128(&sPtr[sindex]); - var rv = Sse2.LoadVector128(&tPtr[tindex]); - if ((uint)Sse2.MoveMask(Sse2.CompareEqual(lv, rv)) == ushort.MaxValue) continue; - tindex += 16; - sindex += 16; - break; + while (bytesFound <= maxBytes - Intrinsics.SseRegisterSize) + { + tindex -= Intrinsics.SseRegisterSize; + sindex -= Intrinsics.SseRegisterSize; + var lv = Sse2.LoadVector128(&sPtr[sindex]); + var rv = Sse2.LoadVector128(&tPtr[tindex]); + if ((uint)Sse2.MoveMask(Sse2.CompareEqual(lv, rv)) != ushort.MaxValue) + { + tindex += Intrinsics.SseRegisterSize; + sindex += Intrinsics.SseRegisterSize; + break; + } + + bytesFound += Intrinsics.SseRegisterSize; + } } - + while (bytesFound < maxBytes) { --sindex; @@ -460,18 +490,20 @@ private unsafe long MatchingBytesToLeft(long start, long tstart, byte* sourcePtr var tBuf = target.AsSpan(); var sBuf = source.AsSpan(); - for (; (sindex >= 
vectorSize && tindex >= vectorSize) - && bytesFound <= maxBytes - vectorSize; bytesFound += vectorSize) + while (sindex >= vectorSize && tindex >= vectorSize && bytesFound <= maxBytes - vectorSize) { - tindex -= vectorSize; sindex -= vectorSize; var lv = new Vector(sBuf.Slice((int)sindex)); var rv = new Vector(tBuf.Slice((int)tindex)); - if (Vector.EqualsAll(lv, rv)) continue; - tindex += vectorSize; - sindex += vectorSize; - break; + if (!Vector.EqualsAll(lv, rv)) + { + tindex += vectorSize; + sindex += vectorSize; + break; + } + + bytesFound += vectorSize; } while (bytesFound < maxBytes) @@ -501,12 +533,18 @@ private unsafe long MatchingBytesToRightAvx2(long end, long tstart, byte* source byte* tPtr = targetPtr; byte* sPtr = sourcePtr; - for (; (srcLength - sindex) >= 32 && (trgLength - tindex) >= 32 && bytesFound <= maxBytes - 32; bytesFound += 32, tindex += 32, sindex += 32) + while ((srcLength - sindex) >= Intrinsics.AvxRegisterSize && + (trgLength - tindex) >= Intrinsics.AvxRegisterSize && + bytesFound <= maxBytes - Intrinsics.AvxRegisterSize) { var lv = Avx2.LoadVector256(&sPtr[sindex]); var rv = Avx2.LoadVector256(&tPtr[tindex]); - if (Avx2.MoveMask(Avx2.CompareEqual(lv, rv)) == EQMASK) continue; - break; + if (Avx2.MoveMask(Avx2.CompareEqual(lv, rv)) != EQMASK) + break; + + bytesFound += Intrinsics.AvxRegisterSize; + tindex += Intrinsics.AvxRegisterSize; + sindex += Intrinsics.AvxRegisterSize; } while (bytesFound < maxBytes) @@ -533,14 +571,20 @@ private unsafe long MatchingBytesToRightSse2(long end, long tstart, byte* source byte* tPtr = targetPtr; byte* sPtr = sourcePtr; - for (; (srcLength - sindex) >= 16 && (trgLength - tindex) >= 16 && bytesFound <= maxBytes - 16; bytesFound += 16, tindex += 16, sindex += 16) + while ((srcLength - sindex) >= Intrinsics.SseRegisterSize && + (trgLength - tindex) >= Intrinsics.SseRegisterSize && + bytesFound <= maxBytes - Intrinsics.SseRegisterSize) { var lv = Sse2.LoadVector128(&sPtr[sindex]); var rv = 
Sse2.LoadVector128(&tPtr[tindex]); - if (Sse2.MoveMask(Sse2.CompareEqual(lv, rv)) == ushort.MaxValue) continue; - break; - } + if (Sse2.MoveMask(Sse2.CompareEqual(lv, rv)) != ushort.MaxValue) + break; + bytesFound += Intrinsics.SseRegisterSize; + tindex += Intrinsics.SseRegisterSize; + sindex += Intrinsics.SseRegisterSize; + } + while (bytesFound < maxBytes) { if (sindex >= srcLength || tindex >= trgLength) break; @@ -551,6 +595,7 @@ private unsafe long MatchingBytesToRightSse2(long end, long tstart, byte* source ++sindex; ++bytesFound; } + return bytesFound; } #endif @@ -583,14 +628,18 @@ private unsafe long MatchingBytesToRight(long end, long tstart, byte* sourcePtr, var tBuf = target.AsSpan(); var sBuf = source.AsSpan(); - for (; (srcLength - sindex) >= vectorSize && (trgLength - tindex) >= vectorSize - && bytesFound <= maxBytes - vectorSize; - bytesFound += vectorSize, tindex += vectorSize, sindex += vectorSize) + while ((srcLength - sindex) >= vectorSize + && (trgLength - tindex) >= vectorSize + && bytesFound <= maxBytes - vectorSize) { var lv = new Vector(sBuf.Slice((int)sindex)); var rv = new Vector(tBuf.Slice((int)tindex)); - if (Vector.EqualsAll(lv, rv)) continue; - break; + if (!Vector.EqualsAll(lv, rv)) + break; + + bytesFound += vectorSize; + tindex += vectorSize; + sindex += vectorSize; } while (bytesFound < maxBytes) diff --git a/src/VCDiff/Shared/Intrinsics.cs b/src/VCDiff/Shared/Intrinsics.cs new file mode 100644 index 0000000..8d70df8 --- /dev/null +++ b/src/VCDiff/Shared/Intrinsics.cs @@ -0,0 +1,120 @@ +using System; +using System.Runtime.CompilerServices; + +#if NETCOREAPP3_1 || NET5_0 +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + +namespace VCDiff.Shared +{ + internal static class Intrinsics + { + public const int AvxRegisterSize = 32; + public const int SseRegisterSize = 16; + public static readonly int MaxRegisterSize; + public static readonly bool UseAvx; + + static Intrinsics() + { +#if NETCOREAPP3_1 || 
NET5_0 + if (Sse2.IsSupported) + { + MaxRegisterSize = SseRegisterSize; + UseAvx = false; + } + + if (Avx.IsSupported) + { + MaxRegisterSize = AvxRegisterSize; + UseAvx = true; + } + + // Neither SSE2 nor AVX is supported. + if (MaxRegisterSize == 0) + { + // Set to int.MaxValue so bytesLeft never reaches it and the scalar path is always taken. + MaxRegisterSize = int.MaxValue; + } +#endif + } + + public static unsafe void FillArrayVectorized(long[] array, long value) + { +#if NETCOREAPP3_1 || NET5_0 + int bytesLeft = array.Length * sizeof(long); + if (bytesLeft >= MaxRegisterSize) + { + // Note: This can be 0 cost in .NET 5 when paired with pinned GC.AllocateUninitializedArray. + fixed (long* first = &array[0]) + { + if (UseAvx) + Avx2FillArray(first, value, ref bytesLeft); + else + Sse2FillArray(first, value, ref bytesLeft); + + // Fill rest of array. + var elementsLeft = bytesLeft / sizeof(long); + for (int x = array.Length - elementsLeft; x < array.Length; x++) + array[x] = value; + } + } + else + { + // Array is smaller than one vector register; fill it element by element. + for (int x = 0; x < array.Length; x++) + array[x] = value; + } +#else + // Accelerate via loop unrolled solution. + array.AsSpan().Fill(value); +#endif + } + + +#if NETCOREAPP3_1 || NET5_0 + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + private static unsafe void Sse2FillArray(long* first, long value, ref int bytesLeft) + { + // Initialize. + var numValues = SseRegisterSize / sizeof(long); + var vectorValues = stackalloc long[numValues]; + FillPointer(vectorValues, value, numValues); + + var vector = Sse2.LoadVector128(vectorValues); + while (bytesLeft >= SseRegisterSize) + { + Sse2.Store(first, vector); + first += numValues; + bytesLeft -= SseRegisterSize; + } + } + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + private static unsafe void Avx2FillArray(long* first, long value, ref int bytesLeft) + { + // Initialize. 
+ var numValues = AvxRegisterSize / sizeof(long); + var vectorValues = stackalloc long[numValues]; + FillPointer(vectorValues, value, numValues); + + var vector = Avx2.LoadVector256(vectorValues); + while (bytesLeft >= AvxRegisterSize) + { + Avx2.Store(first, vector); + first += numValues; + bytesLeft -= AvxRegisterSize; + } + } + + private static unsafe void FillPointer(long* values, long value, int numValues) + { + for (int x = 0; x < numValues; x++) + { + *values = value; + values += 1; + } + } +#endif + } +}