From bfb44c2798a6a44bd6e95481002c8affbe8dd255 Mon Sep 17 00:00:00 2001
From: Jorrit Jongma <git@jongma.org>
Date: Mon, 25 May 2020 18:16:19 +0200
Subject: [PATCH 1/3] Move OpenSSL-related MD4/5 defines and imports to
 lib/mdigest.h

Works just as well, prevents having to repeat them across files
---
 checksum.c    | 11 -----------
 lib/mdigest.h | 10 ++++++++++
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/checksum.c b/checksum.c
index 19df32a31..d6a91e7ce 100644
--- a/checksum.c
+++ b/checksum.c
@@ -30,10 +30,6 @@
 #ifdef SUPPORT_XXHASH
 #include "xxhash.h"
 #endif
-#ifdef USE_OPENSSL
-#include "openssl/md4.h"
-#include "openssl/md5.h"
-#endif
 
 extern int am_server;
 extern int whole_file;
@@ -63,13 +59,6 @@ struct name_num_obj valid_checksums = {
 	}
 };
 
-#ifndef USE_OPENSSL
-#define MD5_CTX md_context
-#define MD5_Init md5_begin
-#define MD5_Update md5_update
-#define MD5_Final(digest, cptr) md5_result(cptr, digest)
-#endif
-
 int xfersum_type = 0; /* used for the file transfer checksums */
 int checksum_type = 0; /* used for the pre-transfer (--checksum) checksums */
 
diff --git a/lib/mdigest.h b/lib/mdigest.h
index 86c1140f0..e543d6f3a 100644
--- a/lib/mdigest.h
+++ b/lib/mdigest.h
@@ -1,5 +1,10 @@
 /* The include file for both the MD4 and MD5 routines. */
 
+#ifdef USE_OPENSSL
+#include "openssl/md4.h"
+#include "openssl/md5.h"
+#endif
+
 #define MD4_DIGEST_LEN 16
 #define MD5_DIGEST_LEN 16
 #define MAX_DIGEST_LEN MD5_DIGEST_LEN
@@ -18,6 +23,11 @@ void mdfour_update(md_context *md, const uchar *in, uint32 length);
 void mdfour_result(md_context *md, uchar digest[MD4_DIGEST_LEN]);
 
 #ifndef USE_OPENSSL
+#define MD5_CTX md_context
+#define MD5_Init md5_begin
+#define MD5_Update md5_update
+#define MD5_Final(digest, cptr) md5_result(cptr, digest)
+
 void md5_begin(md_context *ctx);
 void md5_update(md_context *ctx, const uchar *input, uint32 length);
 void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);

From e892b00699799a69fd94cfdcf87793baa5b183d7 Mon Sep 17 00:00:00 2001
From: Jorrit Jongma <git@jongma.org>
Date: Mon, 25 May 2020 19:12:45 +0200
Subject: [PATCH 2/3] SSE2/AVX2 optimized version of get_checksum2() (if MD5)
 for x86-64

MD5 hashes computed during rsync's block matching phase are independent
and thus possible to process in parallel. This code processes 4 blocks
in parallel if SSE2 is available, or 8 if AVX2 is available. An increase
of performance (or decrease of CPU usage) of up to 6x has been measured.

A prefetching algorithm is used to predict and load upcoming blocks, as
this prevents the need for extensive modifications to other parts of
the rsync sources to get this working.
---
 Makefile.in                  |   5 +-
 checksum.c                   |  14 +-
 generator.c                  |  12 +-
 match.c                      |   8 +-
 simd-checksum-x86_64.cpp     | 179 +++++++
 simd-md5-parallel-x86_64.cpp | 919 +++++++++++++++++++++++++++++++++++
 6 files changed, 1128 insertions(+), 9 deletions(-)
 create mode 100644 simd-md5-parallel-x86_64.cpp

diff --git a/Makefile.in b/Makefile.in
index 13cba5d9b..f1a01fd33 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -31,7 +31,7 @@ VERSION=@RSYNC_VERSION@
 .SUFFIXES:
 .SUFFIXES: .c .o
 
-SIMD_x86_64=simd-checksum-x86_64.o lib/md5-asm-x86_64.o
+SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o lib/md5-asm-x86_64.o
 
 GENFILES=configure.sh aclocal.m4 config.h.in proto.h proto.h-tstamp rsync.1 rsync-ssl.1 rsyncd.conf.5
 HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
@@ -122,6 +122,9 @@ rounding.h: rounding.c rsync.h proto.h
 simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
 	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $<
 
+simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $<
+
 lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.s
 	$(CC) -c -o $@ $<
 
diff --git a/checksum.c b/checksum.c
index d6a91e7ce..f16e07d08 100644
--- a/checksum.c
+++ b/checksum.c
@@ -197,9 +197,19 @@ uint32 get_checksum1(char *buf1, int32 len)
 	}
 	return (s1 & 0xffff) + (s2 << 16);
 }
-#endif
 
-void get_checksum2(char *buf, int32 len, char *sum)
+void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
+{
+}
+
+void checksum2_disable_prefetch()
+{
+}
+
+void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
+#else
+void get_checksum2_nosimd(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
+#endif
 {
 	switch (xfersum_type) {
 #ifdef SUPPORT_XXHASH
diff --git a/generator.c b/generator.c
index dba97d854..e3b4c90f7 100644
--- a/generator.c
+++ b/generator.c
@@ -706,10 +706,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
 	if (append_mode > 0 && f_copy < 0)
 		return 0;
 
-	if (len > 0)
+	if (len > 0) {
 		mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
-	else
+        checksum2_enable_prefetch(mapbuf, len, sum.blength);
+	} else {
 		mapbuf = NULL;
+    }
 
 	for (i = 0; i < sum.count; i++) {
 		int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
@@ -727,7 +729,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
 		}
 
 		sum1 = get_checksum1(map, n1);
-		get_checksum2(map, n1, sum2);
+		get_checksum2(map, n1, sum2, offset - n1);
 
 		if (DEBUG_GTE(DELTASUM, 3)) {
 			rprintf(FINFO,
@@ -739,8 +741,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
 		write_buf(f_out, sum2, sum.s2length);
 	}
 
-	if (mapbuf)
+	if (mapbuf) {
 		unmap_file(mapbuf);
+	    checksum2_disable_prefetch();
+    }
 
 	return 0;
 }
diff --git a/match.c b/match.c
index 0639bf248..aede432a1 100644
--- a/match.c
+++ b/match.c
@@ -167,6 +167,8 @@ static void hash_search(int f,struct sum_struct *s,
 	if (DEBUG_GTE(DELTASUM, 3))
 		rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
 
+	checksum2_enable_prefetch(buf, len, s->blength);
+
 	offset = aligned_offset = aligned_i = 0;
 
 	end = len + 1 - s->sums[s->count-1].len;
@@ -229,7 +231,7 @@ static void hash_search(int f,struct sum_struct *s,
 
 			if (!done_csum2) {
 				map = (schar *)map_ptr(buf,offset,l);
-				get_checksum2((char *)map,l,sum2);
+				get_checksum2((char *)map, l, sum2, offset);
 				done_csum2 = 1;
 			}
 
@@ -271,7 +273,7 @@ static void hash_search(int f,struct sum_struct *s,
 						sum = get_checksum1((char *)map, l);
 						if (sum != s->sums[i].sum1)
 							goto check_want_i;
-						get_checksum2((char *)map, l, sum2);
+						get_checksum2((char *)map, l, sum2, aligned_offset);
 						if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
 							goto check_want_i;
 						/* OK, we have a re-alignment match.  Bump the offset
@@ -339,6 +341,8 @@ static void hash_search(int f,struct sum_struct *s,
 			matched(f, s, buf, offset - s->blength, -2);
 	} while (++offset < end);
 
+    checksum2_disable_prefetch();
+
 	matched(f, s, buf, len, -1);
 	map_ptr(buf, len-1, 1);
 }
diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
index 66f726505..52a3d1936 100644
--- a/simd-checksum-x86_64.cpp
+++ b/simd-checksum-x86_64.cpp
@@ -48,6 +48,22 @@
  * This file is compiled using GCC 4.8+'s C++ front end to allow the use of
  * the target attribute, selecting the fastest code path based on runtime
  * detection of CPU capabilities.
+ *
+ * ----
+ *
+ * get_checksum2() is optimized for the case where the selected transfer
+ * checksum is MD5. MD5 can't be made significantly faster with SIMD
+ * instructions than the assembly version already included but SIMD
+ * instructions can be used to hash multiple streams in parallel (see
+ * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
+ * block-matching algorithm hashes the blocks independently (in contrast to
+ * the whole-file checksum) this method can be employed here.
+ *
+ * To prevent needing to modify the core rsync sources significantly, a
+ * prefetching strategy is used. When a checksum2 is requested, the code
+ * reads ahead several blocks, creates the MD5 hashes for each block in
+ * parallel, returns the hash for the first block, and caches the results
+ * for the other blocks to return in future calls to get_checksum2().
  */
 
 #ifdef __x86_64__
@@ -409,6 +425,169 @@ uint32 get_checksum1(char *buf1, int32 len) {
     return (s1 & 0xffff) + (s2 << 16);
 }
 
+#define PREFETCH_ENABLE // debugging
+
+#if 0 // debugging
+#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
+#else
+#define PREFETCH_PRINTF(f_, ...) (void)0;
+#endif
+
+#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
+#define PREFETCH_MAX_BLOCKS 8
+#define CSUM_MD5 5
+
+typedef struct {
+    int in_use;
+    OFF_T offset;
+    int32 len;
+    char sum[SUM_LENGTH];
+} prefetch_sum_t;
+
+typedef struct {
+    struct map_struct *map;
+    OFF_T len;
+    OFF_T last;
+    int32 blocklen;
+    int blocks;
+    prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
+} prefetch_t;
+
+prefetch_t *prefetch;
+
+extern int xfersum_type;
+extern int checksum_seed;
+extern int proper_seed_order;
+extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
+
+extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
+
+// see simd-md5-parallel-x86_64.cpp
+extern int md5_parallel_slots();
+extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
+
+void checksum2_disable_prefetch() {
+    if (prefetch) {
+        PREFETCH_PRINTF("checksum2_disable_prefetch\n");
+        free(prefetch);
+        prefetch = NULL;
+    }
+}
+
+void checksum2_enable_prefetch(struct map_struct *map, OFF_T len, int32 blocklen) {
+#ifdef PREFETCH_ENABLE
+    checksum2_disable_prefetch();
+    int slots = md5_parallel_slots();
+    if ((xfersum_type == CSUM_MD5) && (slots > 1) && (len >= blocklen * PREFETCH_MAX_BLOCKS) && (blocklen >= PREFETCH_MIN_LEN)) {
+        prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
+        memset(prefetch, 0, sizeof(prefetch_t));
+        prefetch->map = map;
+        prefetch->len = len;
+        prefetch->last = 0;
+        prefetch->blocklen = blocklen;
+        prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
+        PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
+    }
+#else
+    (void)map;
+    (void)len;
+    (void)blocklen;
+#endif
+}
+
+static inline void checksum2_reset_prefetch() {
+    for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
+        prefetch->sums[i].in_use = 0;
+    }
+}
+
+static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset) {
+    if (prefetch->sums[0].in_use) {
+        if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
+            memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
+            for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
+                prefetch->sums[i] = prefetch->sums[i + 1];
+            }
+            prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
+            PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
+            return 1;
+        } else {
+            // unexpected access, reset cache
+            PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
+            checksum2_reset_prefetch();
+        }
+    }
+    return 0;
+}
+
+static int checksum2_perform_prefetch(OFF_T prefetch_offset) {
+    int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
+    if (blocks < 2) return 0; // fall through to non-simd, probably faster
+
+    int32 total = 0;
+    int i;
+    for (i = 0; i < blocks; i++) {
+        prefetch->sums[i].offset = prefetch_offset + total;
+        prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
+        prefetch->sums[i].in_use = 0;
+        total += prefetch->sums[i].len;
+    }
+    for (; i < PREFETCH_MAX_BLOCKS; i++) {
+        prefetch->sums[i].in_use = 0;
+    }
+
+    uchar seedbuf[4];
+    SIVALu(seedbuf, 0, checksum_seed);
+
+    PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
+    char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
+    char* bufs[PREFETCH_MAX_BLOCKS] = {0};
+    int lens[PREFETCH_MAX_BLOCKS] = {0};
+    char* sums[PREFETCH_MAX_BLOCKS] = {0};
+    for (i = 0; i < blocks; i++) {
+        bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
+        lens[i] = prefetch->sums[i].len;
+        sums[i] = prefetch->sums[i].sum;
+    }
+    if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
+        for (i = 0; i < blocks; i++) {
+            prefetch->sums[i].in_use = 1;
+        }
+        return 1;
+    } else {
+        // this should never be, abort
+        PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
+        checksum2_disable_prefetch();
+    }
+    return 0;
+}
+
+void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset) {
+    if (prefetch) {
+        PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
+        OFF_T last = prefetch->last;
+        prefetch->last = prefetch_offset;
+        if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
+            // we're looking around trying to align blocks, prefetching will slow things down
+            PREFETCH_PRINTF("get_checksum2 SEEK\n");
+            checksum2_reset_prefetch();
+        } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
+            // hit
+            return;
+        } else if (checksum2_perform_prefetch(prefetch_offset)) {
+            if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
+                // hit; should always be as we just fetched this data
+                return;
+            } else {
+                // this should never be, abort
+                PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
+                checksum2_disable_prefetch();
+            }
+        }
+    }
+    get_checksum2_nosimd(buf, len, sum, prefetch_offset);
+}
+
 }
 #endif /* HAVE_SIMD */
 #endif /* __cplusplus */
diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
new file mode 100644
index 000000000..a25f6c7ab
--- /dev/null
+++ b/simd-md5-parallel-x86_64.cpp
@@ -0,0 +1,919 @@
+/*
+ * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
+ *
+ * Original author: Nicolas Noble, 2017
+ * Modifications:   Jorrit Jongma, 2020
+ *
+ * The original code was released in the public domain by the original author,
+ * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
+ * in case public domain does not apply in your country. These modifications
+ * are likewise released in the public domain, with the same MIT license
+ * fallback.
+ *
+ * The original publication can be found at:
+ *
+ * https://github.com/nicolasnoble/sse-hash
+ */
+/*
+ * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
+ * MD5 code has been removed and those code paths rerouted to use the MD5
+ * code already present in rsync, and wrapper functions have been added.
+ *
+ * This code allows multiple independent MD5 streams to be processed in
+ * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
+ * lower than that of the original C routines for MD5, the processing of
+ * additional streams is "for free".
+ *
+ * Single streams are rerouted to rsync's normal MD5 code as that is faster
+ * for that case. A further optimization is possible by using SSE2 code on
+ * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
+ * implemented here as it would require some restructuring, and in practise
+ * the code here is only rarely called with less than the maximum amount of
+ * streams (typically once at the end of each checksum2'd file).
+ *
+ * Benchmarks (in MB/s)            C     ASM  SSE2*1  SSE2*4  AVX2*1  AVX2*8
+ * - Intel Atom D2700            302     334     166     664     N/A     N/A
+ * - Intel i7-7700hq             351     376     289    1156     273    2184
+ * - AMD ThreadRipper 2950x      728     784     568    2272     430    3440
+ */
+
+#ifdef __x86_64__
+#ifdef __cplusplus
+
+extern "C" {
+
+#include "rsync.h"
+
+}
+
+#ifdef HAVE_SIMD
+
+#define PMD5_ALLOW_SSE2 // debugging
+#define PMD5_ALLOW_AVX2 // debugging
+
+#ifdef PMD5_ALLOW_AVX2
+#ifndef PMD5_ALLOW_SSE2
+#define PMD5_ALLOW_SSE2
+#endif
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#include <immintrin.h>
+
+#define PMD5_SLOTS_DEFAULT 0
+#define PMD5_SLOTS_SSE2 4
+#define PMD5_SLOTS_AVX2 8
+#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) static int pmd5_slots() {
+    return PMD5_SLOTS_SSE2;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) static int pmd5_slots() {
+    return PMD5_SLOTS_AVX2;
+}
+#endif
+
+__attribute__ ((target("default"))) static int pmd5_slots() {
+    return PMD5_SLOTS_DEFAULT;
+}
+
+/* The parallel MD5 context structure. */
+typedef struct {
+    __m128i state_sse2[4];
+    __m256i state_avx2[4];
+    uint64_t len[PMD5_SLOTS_MAX];
+} pmd5_context;
+
+/* The status returned by the various functions below. */
+typedef enum {
+    PMD5_SUCCESS,
+    PMD5_INVALID_SLOT,
+    PMD5_UNALIGNED_UPDATE,
+} pmd5_status;
+
+/* Initializes all slots in the given pmd5 context. */
+static pmd5_status pmd5_init_all(pmd5_context * ctx);
+
+/* Initializes a single slot out in the given pmd5 context. */
+static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
+
+/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
+   The stream pointers will be incremented accordingly.
+   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
+   The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
+   Stride defaults to 64 if 0 is passed. */
+static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
+
+/* Makes an MD5 update on all slots in parallel, given different lengths.
+   The stream pointers will be incremented accordingly.
+   The lengths will be decreased accordingly. Not all data might be consumed.
+   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
+   The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
+static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
+
+/* Finishes all slots at once. Fills in all digests. */
+static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
+
+/* Finishes one slot. The other slots will be unnaffected. The finished slot can then continue to hash garbage using
+   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
+static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
+
+/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
+   multiple of 64. The other slots will be unnaffected. The finished slot can then continue to hash garbage using
+   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
+static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
+
+/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
+static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
+
+/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
+static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
+
+#define S11  7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21  5
+#define S22  9
+#define S23 14
+#define S24 20
+#define S31  4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41  6
+#define S42 10
+#define S43 15
+#define S44 21
+
+#define T1  0xD76AA478
+#define T2  0xE8C7B756
+#define T3  0x242070DB
+#define T4  0xC1BDCEEE
+#define T5  0xF57C0FAF
+#define T6  0x4787C62A
+#define T7  0xA8304613
+#define T8  0xFD469501
+#define T9  0x698098D8
+#define T10 0x8B44F7AF
+#define T11 0xFFFF5BB1
+#define T12 0x895CD7BE
+#define T13 0x6B901122
+#define T14 0xFD987193
+#define T15 0xA679438E
+#define T16 0x49B40821
+#define T17 0xF61E2562
+#define T18 0xC040B340
+#define T19 0x265E5A51
+#define T20 0xE9B6C7AA
+#define T21 0xD62F105D
+#define T22 0x02441453
+#define T23 0xD8A1E681
+#define T24 0xE7D3FBC8
+#define T25 0x21E1CDE6
+#define T26 0xC33707D6
+#define T27 0xF4D50D87
+#define T28 0x455A14ED
+#define T29 0xA9E3E905
+#define T30 0xFCEFA3F8
+#define T31 0x676F02D9
+#define T32 0x8D2A4C8A
+#define T33 0xFFFA3942
+#define T34 0x8771F681
+#define T35 0x6D9D6122
+#define T36 0xFDE5380C
+#define T37 0xA4BEEA44
+#define T38 0x4BDECFA9
+#define T39 0xF6BB4B60
+#define T40 0xBEBFBC70
+#define T41 0x289B7EC6
+#define T42 0xEAA127FA
+#define T43 0xD4EF3085
+#define T44 0x04881D05
+#define T45 0xD9D4D039
+#define T46 0xE6DB99E5
+#define T47 0x1FA27CF8
+#define T48 0xC4AC5665
+#define T49 0xF4292244
+#define T50 0x432AFF97
+#define T51 0xAB9423A7
+#define T52 0xFC93A039
+#define T53 0x655B59C3
+#define T54 0x8F0CCC92
+#define T55 0xFFEFF47D
+#define T56 0x85845DD1
+#define T57 0x6FA87E4F
+#define T58 0xFE2CE6E0
+#define T59 0xA3014314
+#define T60 0x4E0811A1
+#define T61 0xF7537E82
+#define T62 0xBD3AF235
+#define T63 0x2AD7D2BB
+#define T64 0xEB86D391
+
+#define ROTL_SSE2(x, n) { \
+    __m128i s; \
+    s = _mm_srli_epi32(x, 32 - n); \
+    x = _mm_slli_epi32(x, n); \
+    x = _mm_or_si128(x, s); \
+};
+
+#define ROTL_AVX2(x, n) { \
+    __m256i s; \
+    s = _mm256_srli_epi32(x, 32 - n); \
+    x = _mm256_slli_epi32(x, n); \
+    x = _mm256_or_si256(x, s); \
+};
+
+#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
+#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
+#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
+#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
+
+#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
+#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
+#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
+#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
+
+#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
+    a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
+    ROTL_SSE2(a, s); \
+    a = _mm_add_epi32(a, b); \
+}
+
+#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
+    a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
+    ROTL_AVX2(a, s); \
+    a = _mm256_add_epi32(a, b); \
+}
+
+#define IA 0x67452301
+#define IB 0xefcdab89
+#define IC 0x98badcfe
+#define ID 0x10325476
+
+#define GET_MD5_DATA(dest, src, pos)         \
+    dest =                                   \
+        ((uint32_t) src[pos + 0]) <<  0 |    \
+        ((uint32_t) src[pos + 1]) <<  8 |    \
+        ((uint32_t) src[pos + 2]) << 16 |    \
+        ((uint32_t) src[pos + 3]) << 24
+
+#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
+    uint32_t v0, v1, v2, v3;                 \
+    GET_MD5_DATA(v0, src[0], pos);           \
+    GET_MD5_DATA(v1, src[1], pos);           \
+    GET_MD5_DATA(v2, src[2], pos);           \
+    GET_MD5_DATA(v3, src[3], pos);           \
+    dest = _mm_setr_epi32(v0, v1, v2, v3);   \
+}
+
+#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
+    uint32_t v0, v1, v2, v3;                 \
+    uint32_t v4, v5, v6, v7;                 \
+    GET_MD5_DATA(v0, src[0], pos);           \
+    GET_MD5_DATA(v1, src[1], pos);           \
+    GET_MD5_DATA(v2, src[2], pos);           \
+    GET_MD5_DATA(v3, src[3], pos);           \
+    GET_MD5_DATA(v4, src[4], pos);           \
+    GET_MD5_DATA(v5, src[5], pos);           \
+    GET_MD5_DATA(v6, src[6], pos);           \
+    GET_MD5_DATA(v7, src[7], pos);           \
+    dest = _mm256_setr_epi32(v0, v1, v2, v3, \
+                          v4, v5, v6, v7);   \
+}
+
+#define PUT_MD5_DATA(dest, val, pos) {       \
+    dest[pos + 0] = (val >>  0) & 0xff;      \
+    dest[pos + 1] = (val >>  8) & 0xff;      \
+    dest[pos + 2] = (val >> 16) & 0xff;      \
+    dest[pos + 3] = (val >> 24) & 0xff;      \
+}
+
+const static uint8_t md5_padding[64] = {
+    0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) static pmd5_status pmd5_init_all(pmd5_context * ctx) {
+    int i;
+    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+        ctx->len[i] = 0;
+    }
+
+    ctx->state_sse2[0] = _mm_set1_epi32(IA);
+    ctx->state_sse2[1] = _mm_set1_epi32(IB);
+    ctx->state_sse2[2] = _mm_set1_epi32(IC);
+    ctx->state_sse2[3] = _mm_set1_epi32(ID);
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) static pmd5_status pmd5_init_all(pmd5_context * ctx) {
+    int i;
+    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+        ctx->len[i] = 0;
+    }
+
+    ctx->state_avx2[0] = _mm256_set1_epi32(IA);
+    ctx->state_avx2[1] = _mm256_set1_epi32(IB);
+    ctx->state_avx2[2] = _mm256_set1_epi32(IC);
+    ctx->state_avx2[3] = _mm256_set1_epi32(ID);
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+__attribute__ ((target("default"))) static pmd5_status pmd5_init_all(pmd5_context * ctx) {
+    return PMD5_INVALID_SLOT;
+}
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) static pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    uint32_t v[4][PMD5_SLOTS_SSE2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
+    }
+
+    v[0][slot] = a;
+    v[1][slot] = b;
+    v[2][slot] = c;
+    v[3][slot] = d;
+
+    for (i = 0; i < 4; i++) {
+        ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
+    }
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) static pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    uint32_t v[4][PMD5_SLOTS_AVX2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
+    }
+
+    v[0][slot] = a;
+    v[1][slot] = b;
+    v[2][slot] = c;
+    v[3][slot] = d;
+
+    for (i = 0; i < 4; i++) {
+        ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
+    }
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+__attribute__ ((target("default"))) static pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+    return PMD5_INVALID_SLOT;
+}
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) static pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
+    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    uint32_t v[4][PMD5_SLOTS_SSE2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
+    }
+
+    *a = v[0][slot];
+    *b = v[1][slot];
+    *c = v[2][slot];
+    *d = v[3][slot];
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) static pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
+    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    uint32_t v[4][PMD5_SLOTS_AVX2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
+    }
+
+    *a = v[0][slot];
+    *b = v[1][slot];
+    *c = v[2][slot];
+    *d = v[3][slot];
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+__attribute__ ((target("default"))) static pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
+    return PMD5_INVALID_SLOT;
+}
+
+static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot) {
+    return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
+}
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) static void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX]) {
+    __m128i W[MD5_DIGEST_LEN], a, b, c, d;
+
+    GET_PMD5_DATA_SSE2(W[ 0], data,  0);
+    GET_PMD5_DATA_SSE2(W[ 1], data,  4);
+    GET_PMD5_DATA_SSE2(W[ 2], data,  8);
+    GET_PMD5_DATA_SSE2(W[ 3], data, 12);
+    GET_PMD5_DATA_SSE2(W[ 4], data, 16);
+    GET_PMD5_DATA_SSE2(W[ 5], data, 20);
+    GET_PMD5_DATA_SSE2(W[ 6], data, 24);
+    GET_PMD5_DATA_SSE2(W[ 7], data, 28);
+    GET_PMD5_DATA_SSE2(W[ 8], data, 32);
+    GET_PMD5_DATA_SSE2(W[ 9], data, 36);
+    GET_PMD5_DATA_SSE2(W[10], data, 40);
+    GET_PMD5_DATA_SSE2(W[11], data, 44);
+    GET_PMD5_DATA_SSE2(W[12], data, 48);
+    GET_PMD5_DATA_SSE2(W[13], data, 52);
+    GET_PMD5_DATA_SSE2(W[14], data, 56);
+    GET_PMD5_DATA_SSE2(W[15], data, 60);
+
+    a = ctx->state_sse2[0];
+    b = ctx->state_sse2[1];
+    c = ctx->state_sse2[2];
+    d = ctx->state_sse2[3];
+
+    SET_SSE2(F, a, b, c, d, W[ 0], S11,  1);
+    SET_SSE2(F, d, a, b, c, W[ 1], S12,  2);
+    SET_SSE2(F, c, d, a, b, W[ 2], S13,  3);
+    SET_SSE2(F, b, c, d, a, W[ 3], S14,  4);
+    SET_SSE2(F, a, b, c, d, W[ 4], S11,  5);
+    SET_SSE2(F, d, a, b, c, W[ 5], S12,  6);
+    SET_SSE2(F, c, d, a, b, W[ 6], S13,  7);
+    SET_SSE2(F, b, c, d, a, W[ 7], S14,  8);
+    SET_SSE2(F, a, b, c, d, W[ 8], S11,  9);
+    SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
+    SET_SSE2(F, c, d, a, b, W[10], S13, 11);
+    SET_SSE2(F, b, c, d, a, W[11], S14, 12);
+    SET_SSE2(F, a, b, c, d, W[12], S11, 13);
+    SET_SSE2(F, d, a, b, c, W[13], S12, 14);
+    SET_SSE2(F, c, d, a, b, W[14], S13, 15);
+    SET_SSE2(F, b, c, d, a, W[15], S14, 16);
+
+    SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
+    SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
+    SET_SSE2(G, c, d, a, b, W[11], S23, 19);
+    SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
+    SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
+    SET_SSE2(G, d, a, b, c, W[10], S22, 22);
+    SET_SSE2(G, c, d, a, b, W[15], S23, 23);
+    SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
+    SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
+    SET_SSE2(G, d, a, b, c, W[14], S22, 26);
+    SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
+    SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
+    SET_SSE2(G, a, b, c, d, W[13], S21, 29);
+    SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
+    SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
+    SET_SSE2(G, b, c, d, a, W[12], S24, 32);
+
+    SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
+    SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
+    SET_SSE2(H, c, d, a, b, W[11], S33, 35);
+    SET_SSE2(H, b, c, d, a, W[14], S34, 36);
+    SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
+    SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
+    SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
+    SET_SSE2(H, b, c, d, a, W[10], S34, 40);
+    SET_SSE2(H, a, b, c, d, W[13], S31, 41);
+    SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
+    SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
+    SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
+    SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
+    SET_SSE2(H, d, a, b, c, W[12], S32, 46);
+    SET_SSE2(H, c, d, a, b, W[15], S33, 47);
+    SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
+
+    SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
+    SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
+    SET_SSE2(I, c, d, a, b, W[14], S43, 51);
+    SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
+    SET_SSE2(I, a, b, c, d, W[12], S41, 53);
+    SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
+    SET_SSE2(I, c, d, a, b, W[10], S43, 55);
+    SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
+    SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
+    SET_SSE2(I, d, a, b, c, W[15], S42, 58);
+    SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
+    SET_SSE2(I, b, c, d, a, W[13], S44, 60);
+    SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
+    SET_SSE2(I, d, a, b, c, W[11], S42, 62);
+    SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
+    SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
+
+    ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
+    ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
+    ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
+    ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) static void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX]) {
+    __m256i W[MD5_DIGEST_LEN], a, b, c, d;
+
+    GET_PMD5_DATA_AVX2(W[ 0], data,  0);
+    GET_PMD5_DATA_AVX2(W[ 1], data,  4);
+    GET_PMD5_DATA_AVX2(W[ 2], data,  8);
+    GET_PMD5_DATA_AVX2(W[ 3], data, 12);
+    GET_PMD5_DATA_AVX2(W[ 4], data, 16);
+    GET_PMD5_DATA_AVX2(W[ 5], data, 20);
+    GET_PMD5_DATA_AVX2(W[ 6], data, 24);
+    GET_PMD5_DATA_AVX2(W[ 7], data, 28);
+    GET_PMD5_DATA_AVX2(W[ 8], data, 32);
+    GET_PMD5_DATA_AVX2(W[ 9], data, 36);
+    GET_PMD5_DATA_AVX2(W[10], data, 40);
+    GET_PMD5_DATA_AVX2(W[11], data, 44);
+    GET_PMD5_DATA_AVX2(W[12], data, 48);
+    GET_PMD5_DATA_AVX2(W[13], data, 52);
+    GET_PMD5_DATA_AVX2(W[14], data, 56);
+    GET_PMD5_DATA_AVX2(W[15], data, 60);
+
+    a = ctx->state_avx2[0];
+    b = ctx->state_avx2[1];
+    c = ctx->state_avx2[2];
+    d = ctx->state_avx2[3];
+
+    SET_AVX2(F, a, b, c, d, W[ 0], S11,  1);
+    SET_AVX2(F, d, a, b, c, W[ 1], S12,  2);
+    SET_AVX2(F, c, d, a, b, W[ 2], S13,  3);
+    SET_AVX2(F, b, c, d, a, W[ 3], S14,  4);
+    SET_AVX2(F, a, b, c, d, W[ 4], S11,  5);
+    SET_AVX2(F, d, a, b, c, W[ 5], S12,  6);
+    SET_AVX2(F, c, d, a, b, W[ 6], S13,  7);
+    SET_AVX2(F, b, c, d, a, W[ 7], S14,  8);
+    SET_AVX2(F, a, b, c, d, W[ 8], S11,  9);
+    SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
+    SET_AVX2(F, c, d, a, b, W[10], S13, 11);
+    SET_AVX2(F, b, c, d, a, W[11], S14, 12);
+    SET_AVX2(F, a, b, c, d, W[12], S11, 13);
+    SET_AVX2(F, d, a, b, c, W[13], S12, 14);
+    SET_AVX2(F, c, d, a, b, W[14], S13, 15);
+    SET_AVX2(F, b, c, d, a, W[15], S14, 16);
+
+    SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
+    SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
+    SET_AVX2(G, c, d, a, b, W[11], S23, 19);
+    SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
+    SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
+    SET_AVX2(G, d, a, b, c, W[10], S22, 22);
+    SET_AVX2(G, c, d, a, b, W[15], S23, 23);
+    SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
+    SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
+    SET_AVX2(G, d, a, b, c, W[14], S22, 26);
+    SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
+    SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
+    SET_AVX2(G, a, b, c, d, W[13], S21, 29);
+    SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
+    SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
+    SET_AVX2(G, b, c, d, a, W[12], S24, 32);
+
+    SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
+    SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
+    SET_AVX2(H, c, d, a, b, W[11], S33, 35);
+    SET_AVX2(H, b, c, d, a, W[14], S34, 36);
+    SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
+    SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
+    SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
+    SET_AVX2(H, b, c, d, a, W[10], S34, 40);
+    SET_AVX2(H, a, b, c, d, W[13], S31, 41);
+    SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
+    SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
+    SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
+    SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
+    SET_AVX2(H, d, a, b, c, W[12], S32, 46);
+    SET_AVX2(H, c, d, a, b, W[15], S33, 47);
+    SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
+
+    SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
+    SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
+    SET_AVX2(I, c, d, a, b, W[14], S43, 51);
+    SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
+    SET_AVX2(I, a, b, c, d, W[12], S41, 53);
+    SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
+    SET_AVX2(I, c, d, a, b, W[10], S43, 55);
+    SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
+    SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
+    SET_AVX2(I, d, a, b, c, W[15], S42, 58);
+    SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
+    SET_AVX2(I, b, c, d, a, W[13], S44, 60);
+    SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
+    SET_AVX2(I, d, a, b, c, W[11], S42, 62);
+    SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
+    SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
+
+    ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
+    ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
+    ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
+    ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
+}
+#endif
+
+__attribute__ ((target("default"))) static void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX]) {
+}
+
+static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride) {
+    const uint8_t * ptrs[PMD5_SLOTS_MAX];
+
+    if (!length) return PMD5_SUCCESS;
+
+    int slots = pmd5_slots();
+
+    if (!stride) stride = 64;
+
+    int i;
+    for (i = 0; i < slots; i++) {
+        ptrs[i] = data[i];
+        ctx->len[i] += length;
+        if (!ptrs[i]) ptrs[i] = md5_padding;
+    }
+
+    while (length >= 64) {
+        pmd5_process(ctx, ptrs);
+        length -= 64;
+        for (i = 0; i < slots; i++) {
+            if (data[i]) ptrs[i] += stride;
+        }
+    }
+
+    if (length) return PMD5_UNALIGNED_UPDATE;
+
+    for (i = 0; i < slots; i++) {
+        if (data[i]) data[i] = ptrs[i];
+    }
+
+    return PMD5_SUCCESS;
+}
+
+static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]) {
+    uint64_t length = 0;
+    int slots = pmd5_slots();
+
+    int i;
+    for (i = 0; i < slots; i++) {
+        if ((length == 0) || (lengths[i] < length)) length = lengths[i];
+    }
+
+    for (i = 0; i < slots; i++) {
+        lengths[i] -= length;
+    }
+
+    return pmd5_update_all_simple(ctx, data, length, 0);
+}
+
+static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length) {
+    MD5_CTX ctx;
+
+    if ((slot >= pmd5_slots()) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    pmd5_to_md5(pctx, &ctx, slot);
+    if (data && length) {
+        MD5_Update(&ctx, data, length);
+    }
+    MD5_Final(digest, &ctx);
+
+    return PMD5_SUCCESS;
+}
+
+static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot) {
+    return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
+}
+
+static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]) {
+    int i;
+    for (i = 0; i < pmd5_slots(); i++) {
+        pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
+    }
+    return PMD5_SUCCESS;
+}
+
+static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot) {
+    if ((slot >= pmd5_slots()) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting.
+
+#ifdef USE_OPENSSL
+    pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
+#else
+    pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
+#endif
+    return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
+}
+
+static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot) {
+    if ((slot >= pmd5_slots()) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    MD5_Init(ctx);
+
+#ifdef USE_OPENSSL
+    ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
+    ctx->Nh = pctx->len[slot] >> 29;
+
+    uint32_t a, b, c, d;
+    pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
+    if (ret == PMD5_SUCCESS) {
+        // OpenSSL may be using 64-bit types so we couldn't pass them directly
+        ctx->A = a;
+        ctx->B = b;
+        ctx->C = c;
+        ctx->D = d;
+    }
+    return ret;
+#else
+    ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
+    ctx->totalN2 = pctx->len[slot] >> 32;
+    return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
+#endif
+}
+
+extern "C" {
+
+int md5_parallel_slots() {
+    int slots = pmd5_slots();
+    if (slots == 0) return 1;
+    return slots;
+}
+
+int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4) {
+    int slots = md5_parallel_slots();
+    if ((streams < 1) || (streams > slots)) return 0;
+    if (pre4 && post4) return 0;
+    
+    if (slots == 1) {
+        MD5_CTX ctx;
+        MD5_Init(&ctx);
+        if (pre4) {
+            MD5_Update(&ctx, (const unsigned char*)pre4, 4);
+        }
+        MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
+        if (post4) {
+            MD5_Update(&ctx, (const unsigned char*)post4, 4);
+        }
+        if (sum[0]) {
+            MD5_Final((uint8_t*)sum[0], &ctx);
+        }
+        return 0;
+    }
+
+    int i;
+    int active[PMD5_SLOTS_MAX];
+    char* buffers[PMD5_SLOTS_MAX];
+    uint64_t left[PMD5_SLOTS_MAX];
+    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+        active[i] = streams > i;
+        if (i < streams) {
+            buffers[i] = buf[i];
+            left[i] = (uint64_t)len[i];
+        } else {
+            buffers[i] = NULL;
+            left[i] = 0;
+        }
+    }
+    MD5_CTX results[PMD5_SLOTS_MAX];
+
+    pmd5_context ctx_simd;
+    if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
+
+    if (pre4) {
+        char temp_buffers[PMD5_SLOTS_MAX][64];
+        int have_any = 0;
+        for (i = 0; i < slots; i++) {
+            if (active[i]) {
+                if (left[i] < 60) {
+                    MD5_Init(&results[i]);
+                    MD5_Update(&results[i], (const unsigned char*)pre4, 4);
+                    MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
+                    active[i] = 0;
+                    left[i] = 0;
+                } else {
+                    memcpy(temp_buffers[i], pre4, 4);
+                    memcpy(temp_buffers[i] + 4, buffers[i], 60);
+                    buffers[i] += 60;
+                    left[i] -= 60;
+                    have_any = 1;
+                }
+            }
+        }
+
+        if (have_any) {
+            char* ptrs[PMD5_SLOTS_MAX];
+            for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+                ptrs[i] = &temp_buffers[i][0];
+            }
+            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
+                return 0;
+            }
+        }
+    }
+
+    int failed = 0;
+    while (true) {
+        for (i = 0; i < slots; i++) {
+            if (active[i] && (left[i] < 64)) {
+                if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
+                    failed = 1;
+                }
+                active[i] = 0;
+            }
+        }
+
+        uint64_t shortest = 0;
+        for (i = 0; i < slots; i++) {
+            if (!active[i]) {
+                buffers[i] = NULL;
+            } else if ((shortest == 0) || (left[i] < shortest)) {
+                shortest = left[i];
+            }
+        }
+
+        if (shortest > 0) {
+            shortest = shortest & ~63;
+            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
+                failed = 1;
+            }
+            for (i = 0; i < slots; i++) {
+                if (active[i]) {
+                    left[i] -= shortest;
+                }
+            }
+        }
+
+        if (failed) {
+            return 0;
+        } else {
+            int have_any = 0;
+            for (i = 0; i < slots; i++) {
+                have_any |= active[i];
+            }
+            if (!have_any) {
+                break;
+            }
+        }
+    }
+
+    for (i = 0; i < slots; i++) {
+        if (i < streams) {
+            if (left[i] > 0) {
+                // buffer[i] == NULL here
+                MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
+            }
+            if (post4) {
+                MD5_Update(&results[i], (const unsigned char*)post4, 4);
+            }
+            if (sum[i]) {
+                MD5_Final((uint8_t*)sum[i], &results[i]);
+            }
+        }
+    }
+
+    return 1;
+}
+
+}
+
+#endif /* HAVE_SIMD */
+#endif /* __cplusplus */
+#endif /* __x86_64__ */
\ No newline at end of file

From ffd83cc99df93015a817981ad05e05a71312ff64 Mon Sep 17 00:00:00 2001
From: Jorrit Jongma <git@jongma.org>
Date: Fri, 29 May 2020 17:55:14 +0200
Subject: [PATCH 3/3] MD5P8 (whole-file) checksums: parallelization-friendly
 MD5

Splits the input up into 8 independent streams (64-byte interleave), and
produces a final checksum based on the end state of those 8 streams. If
parallelization of MD5 hashing is available, the performance gain is 2x
to 6x.

xxHash is still preferred (and faster), but this provides a reasonably
fast fallback for the case where xxHash libraries are not available at
build time.
---
 Makefile.in                  |   2 +-
 checksum.c                   |  30 +++++++
 lib/md5p8.c                  | 130 +++++++++++++++++++++++++++++
 lib/mdigest.h                |  11 +++
 simd-checksum-x86_64.cpp     |   3 +-
 simd-md5-parallel-x86_64.cpp | 156 ++++++++++++++++++++++++++++++++++-
 6 files changed, 328 insertions(+), 4 deletions(-)
 create mode 100644 lib/md5p8.c

diff --git a/Makefile.in b/Makefile.in
index f1a01fd33..923d90a61 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -36,7 +36,7 @@ SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o lib/md5-asm-x86_64
 GENFILES=configure.sh aclocal.m4 config.h.in proto.h proto.h-tstamp rsync.1 rsync-ssl.1 rsyncd.conf.5
 HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
 	lib/pool_alloc.h
-LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
+LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
 	lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
 zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
 	zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
diff --git a/checksum.c b/checksum.c
index f16e07d08..b6bdaa0c8 100644
--- a/checksum.c
+++ b/checksum.c
@@ -45,6 +45,7 @@ extern const char *checksum_choice;
 #define CSUM_MD4 4
 #define CSUM_MD5 5
 #define CSUM_XXH64 6
+#define CSUM_MD5P8 7
 
 struct name_num_obj valid_checksums = {
 	"checksum", NULL, NULL, 0, 0, {
@@ -52,6 +53,7 @@ struct name_num_obj valid_checksums = {
 		{ CSUM_XXH64, "xxh64", NULL },
 		{ CSUM_XXH64, "xxhash", NULL },
 #endif
+        { CSUM_MD5P8, "md5p8", NULL },
 		{ CSUM_MD5, "md5", NULL },
 		{ CSUM_MD4, "md4", NULL },
 		{ CSUM_NONE, "none", NULL },
@@ -139,6 +141,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
 	  case CSUM_MD4_OLD:
 	  case CSUM_MD4_BUSTED:
 		return MD4_DIGEST_LEN;
+	  case CSUM_MD5P8:
 	  case CSUM_MD5:
 		return MD5_DIGEST_LEN;
 #ifdef SUPPORT_XXHASH
@@ -164,6 +167,7 @@ int canonical_checksum(int csum_type)
 	  case CSUM_MD4_BUSTED:
 		break;
 	  case CSUM_MD4:
+	  case CSUM_MD5P8:
 	  case CSUM_MD5:
 		return -1;
 #ifdef SUPPORT_XXHASH
@@ -217,6 +221,7 @@ void get_checksum2_nosimd(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch
 		SIVAL64(sum, 0, XXH64(buf, len, checksum_seed));
 		break;
 #endif
+	  case CSUM_MD5P8:  // == CSUM_MD5 for checksum2
 	  case CSUM_MD5: {
 		MD5_CTX m5;
 		uchar seedbuf[4];
@@ -332,6 +337,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
 		break;
 	  }
 #endif
+	  case CSUM_MD5P8: {
+		MD5P8_CTX m5p8;
+
+		MD5P8_Init(&m5p8);
+
+		for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
+			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
+
+		remainder = (int32)(len - i);
+		if (remainder > 0)
+			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
+
+		MD5P8_Final((uchar *)sum, &m5p8);
+		break;
+	  }
 	  case CSUM_MD5: {
 		MD5_CTX m5;
 
@@ -407,6 +427,7 @@ static union {
 #ifdef SUPPORT_XXHASH
 static XXH64_state_t* xxh64_state;
 #endif
+static MD5P8_CTX m5p8;
 static int cursum_type;
 
 void sum_init(int csum_type, int seed)
@@ -425,6 +446,9 @@ void sum_init(int csum_type, int seed)
 		XXH64_reset(xxh64_state, 0);
 		break;
 #endif
+	  case CSUM_MD5P8:
+		MD5P8_Init(&m5p8);
+		break;
 	  case CSUM_MD5:
 		MD5_Init(&ctx.m5);
 		break;
@@ -467,6 +491,9 @@ void sum_update(const char *p, int32 len)
 		XXH64_update(xxh64_state, p, len);
 		break;
 #endif
+	  case CSUM_MD5P8:
+		MD5P8_Update(&m5p8, (uchar *)p, len);
+		break;
 	  case CSUM_MD5:
 		MD5_Update(&ctx.m5, (uchar *)p, len);
 		break;
@@ -521,6 +548,9 @@ int sum_end(char *sum)
 		SIVAL64(sum, 0, XXH64_digest(xxh64_state));
 		break;
 #endif
+	  case CSUM_MD5P8:
+		MD5P8_Final((uchar *)sum, &m5p8);
+		break;
 	  case CSUM_MD5:
 		MD5_Final((uchar *)sum, &ctx.m5);
 		break;
diff --git a/lib/md5p8.c b/lib/md5p8.c
new file mode 100644
index 000000000..af83bc0e5
--- /dev/null
+++ b/lib/md5p8.c
@@ -0,0 +1,130 @@
+/*
+ * MD5-based hash friendly to parallel processing, reference implementation
+ *
+ * Author: Jorrit Jongma, 2020
+ *
+ * Released in the public domain falling back to the MIT license
+ * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
+ * apply in your country.
+ */
+/*
+ * MD5P8 is an MD5-based hash friendly to parallel processing. The input
+ * stream is divided into 8 independent streams. For each 512 bytes of input,
+ * the first 64 bytes are send to the first stream, the second 64 bytes to
+ * the second stream, etc. The input stream is padded with zeros to the next
+ * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
+ * containing the A, B, C, and D states of the 8 individual streams, followed
+ * by the (unpadded) length of the input.
+ *
+ * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
+ * than normal MD5 (particularly on files smaller than 10 kB), but with
+ * SIMD-based parallel processing it can be two to six times as fast. Even in
+ * the best-case scenario, xxHash is still at least twice as fast and should
+ * be preferred when available.
+ */
+
+#include "rsync.h"
+
+#ifndef HAVE_SIMD
+
+// each MD5_CTX needs to be 8-byte aligned
+#define MD5P8_Contexts(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
+
+int MD5P8_Init(MD5P8_CTX *ctx)
+{
+    int i;
+    for (i = 0; i < 8; i++) {
+        MD5_Init(MD5P8_Contexts(ctx, i));
+    }
+    ctx->used = 0;
+    ctx->next = 0;
+    return 1;
+}
+
+int MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
+{
+    uint32 pos = 0;
+
+    if ((ctx->used) || (length < 64)) {
+        int cpy = MIN(length, 64 - ctx->used);
+        memmove(&ctx->buffer[ctx->used], input, cpy);
+        ctx->used += cpy;
+        length -= cpy;
+        pos += cpy;
+
+        if (ctx->used == 64) {
+            MD5_Update(MD5P8_Contexts(ctx, ctx->next), ctx->buffer, 64);
+            ctx->used = 0;
+            ctx->next = (ctx->next + 1) % 8;
+        }
+    }
+
+    while (length >= 64) {
+        MD5_Update(MD5P8_Contexts(ctx, ctx->next), &input[pos], 64);
+        ctx->next = (ctx->next + 1) % 8;
+        pos += 64;
+        length -= 64;
+    }
+
+    if (length) {
+        memcpy(ctx->buffer, &input[pos], length);
+        ctx->used = length;
+    }
+    return 1;
+}
+
+int MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
+{
+    int i;
+    uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
+    if (ctx->used) {
+        uchar tmp[64];
+        memset(tmp, 0, 64);
+        MD5P8_Update(ctx, tmp, 64 - ctx->used);
+    }
+    memset(ctx->buffer, 0, 64);
+    while (ctx->next != 0) {
+        MD5P8_Update(ctx, ctx->buffer, 64);
+        sub += 64;
+    }
+
+    uchar state[34*4] = {0};
+
+    for (i = 0; i < 8; i++) {
+        MD5_CTX* md = MD5P8_Contexts(ctx, i);
+#ifdef USE_OPENSSL
+        if (low + md->Nl < low) high++;
+        low += md->Nl;
+        high += md->Nh;
+#else
+        if (low + md->totalN < low) high++;
+        low += md->totalN;
+        high += md->totalN2;
+#endif
+        SIVALu(state, i*16, md->A);
+        SIVALu(state, i*16 + 4, md->B);
+        SIVALu(state, i*16 + 8, md->C);
+        SIVALu(state, i*16 + 12, md->D);
+    }
+
+#ifndef USE_OPENSSL
+	high = (low >> 29) | (high << 3);
+	low = (low << 3);
+#endif
+
+    sub <<= 3;
+    if (low - sub > low) high--;
+    low -= sub;
+
+    SIVALu(state, 32*4, low);
+    SIVALu(state, 33*4, high);
+
+    MD5_CTX md;
+    MD5_Init(&md);
+    MD5_Update(&md, state, 34*4);
+    MD5_Final(digest, &md);
+
+    return 1;
+}
+
+#endif
diff --git a/lib/mdigest.h b/lib/mdigest.h
index e543d6f3a..f00da11bc 100644
--- a/lib/mdigest.h
+++ b/lib/mdigest.h
@@ -32,3 +32,14 @@ void md5_begin(md_context *ctx);
 void md5_update(md_context *ctx, const uchar *input, uint32 length);
 void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
 #endif
+
+typedef struct {
+    uchar context_storage[1024];
+    uchar buffer[512];
+    int used;
+    int next;
+} MD5P8_CTX;
+
+int MD5P8_Init(MD5P8_CTX *ctx);
+int MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
+int MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
index 52a3d1936..8fa9803b0 100644
--- a/simd-checksum-x86_64.cpp
+++ b/simd-checksum-x86_64.cpp
@@ -436,6 +436,7 @@ uint32 get_checksum1(char *buf1, int32 len) {
 #define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
 #define PREFETCH_MAX_BLOCKS 8
 #define CSUM_MD5 5
+#define CSUM_MD5P8 7
 
 typedef struct {
     int in_use;
@@ -478,7 +479,7 @@ void checksum2_enable_prefetch(struct map_struct *map, OFF_T len, int32 blocklen
 #ifdef PREFETCH_ENABLE
     checksum2_disable_prefetch();
     int slots = md5_parallel_slots();
-    if ((xfersum_type == CSUM_MD5) && (slots > 1) && (len >= blocklen * PREFETCH_MAX_BLOCKS) && (blocklen >= PREFETCH_MIN_LEN)) {
+    if (((xfersum_type == CSUM_MD5) || (xfersum_type == CSUM_MD5P8)) && (slots > 1) && (len >= blocklen * PREFETCH_MAX_BLOCKS) && (blocklen >= PREFETCH_MIN_LEN)) {
         prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
         memset(prefetch, 0, sizeof(prefetch_t));
         prefetch->map = map;
diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
index a25f6c7ab..21ca46799 100644
--- a/simd-md5-parallel-x86_64.cpp
+++ b/simd-md5-parallel-x86_64.cpp
@@ -17,7 +17,8 @@
 /*
  * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
  * MD5 code has been removed and those code paths rerouted to use the MD5
- * code already present in rsync, and wrapper functions have been added.
+ * code already present in rsync, and wrapper functions have been added. The
+ * MD5P8 code is also new, and is the reason for the new stride parameter.
  *
  * This code allows multiple independent MD5 streams to be processed in
  * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
@@ -755,7 +756,6 @@ static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slo
     uint32_t a, b, c, d;
     pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
     if (ret == PMD5_SUCCESS) {
-        // OpenSSL may be using 64-bit types so we couldn't pass them directly
         ctx->A = a;
         ctx->B = b;
         ctx->C = c;
@@ -912,6 +912,158 @@ int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char
     return 1;
 }
 
+// each pmd5_context needs to be 32-byte aligned
+#define MD5P8_Contexts(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
+
+int MD5P8_Init(MD5P8_CTX *ctx) {
+    int i;
+    for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
+        pmd5_init_all(MD5P8_Contexts(ctx, i));
+    }
+    ctx->used = 0;
+    ctx->next = 0;
+    return 1;
+}
+
+int MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length) {
+    int slots = pmd5_slots();
+    uint32 pos = 0;
+
+    if ((ctx->used) || (length < 512)) {
+        int cpy = MIN(length, 512 - ctx->used);
+        memcpy(&ctx->buffer[ctx->used], input, cpy);
+        ctx->used += cpy;
+        length -= cpy;
+        pos += cpy;
+
+        if (ctx->used == 512) {
+            if (slots == PMD5_SLOTS_AVX2) {
+                const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
+                    (uint8_t*)ctx->buffer,
+                    (uint8_t*)(ctx->buffer + 64),
+                    (uint8_t*)(ctx->buffer + 128),
+                    (uint8_t*)(ctx->buffer + 192),
+                    (uint8_t*)(ctx->buffer + 256),
+                    (uint8_t*)(ctx->buffer + 320),
+                    (uint8_t*)(ctx->buffer + 384),
+                    (uint8_t*)(ctx->buffer + 448)
+                };
+                pmd5_update_all_simple(MD5P8_Contexts(ctx, 0), ptrs, 64, 0);
+            } else {
+                const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
+                    (uint8_t*)ctx->buffer,
+                    (uint8_t*)(ctx->buffer + 64),
+                    (uint8_t*)(ctx->buffer + 128),
+                    (uint8_t*)(ctx->buffer + 192)
+                };
+                const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
+                    (uint8_t*)(ctx->buffer + 256),
+                    (uint8_t*)(ctx->buffer + 320),
+                    (uint8_t*)(ctx->buffer + 384),
+                    (uint8_t*)(ctx->buffer + 448)
+                };
+                pmd5_update_all_simple(MD5P8_Contexts(ctx, 0), ptrs1, 64, 0);
+                pmd5_update_all_simple(MD5P8_Contexts(ctx, 1), ptrs2, 64, 0);
+            }
+            ctx->used = 0;
+        }
+    }
+
+    if (length >= 512) {
+        uint32 blocks = length / 512;
+        if (slots == PMD5_SLOTS_AVX2) {
+            const uint8_t* ptrs[8] = {
+                (uint8_t*)(input + pos),
+                (uint8_t*)(input + pos + 64),
+                (uint8_t*)(input + pos + 128),
+                (uint8_t*)(input + pos + 192),
+                (uint8_t*)(input + pos + 256),
+                (uint8_t*)(input + pos + 320),
+                (uint8_t*)(input + pos + 384),
+                (uint8_t*)(input + pos + 448)
+            };
+            pmd5_update_all_simple(MD5P8_Contexts(ctx, 0), ptrs, blocks * 64, 512);
+        } else {
+            const uint8_t* ptrs1[4] = {
+                (uint8_t*)(input + pos),
+                (uint8_t*)(input + pos + 64),
+                (uint8_t*)(input + pos + 128),
+                (uint8_t*)(input + pos + 192)
+            };
+            const uint8_t* ptrs2[4] = {
+                (uint8_t*)(input + pos + 256),
+                (uint8_t*)(input + pos + 320),
+                (uint8_t*)(input + pos + 384),
+                (uint8_t*)(input + pos + 448)
+            };
+            pmd5_update_all_simple(MD5P8_Contexts(ctx, 0), ptrs1, blocks * 64, 512);
+            pmd5_update_all_simple(MD5P8_Contexts(ctx, 1), ptrs2, blocks * 64, 512);
+        }
+        pos += blocks * 512;
+        length -= blocks * 512;
+    }
+
+    if (length) {
+        memcpy(ctx->buffer, &input[pos], length);
+        ctx->used = length;
+    }
+}
+
+int MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx) {
+    int i;
+    uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
+    if (ctx->used) {
+        uchar tmp[512];
+        memset(tmp, 0, 512);
+        MD5P8_Update(ctx, tmp, 512 - ctx->used);
+    }
+
+    uchar state[34*4] = {0};
+
+    MD5_CTX tmp;
+    for (i = 0; i < 8; i++) {
+        if (pmd5_slots() == PMD5_SLOTS_AVX2) {
+            pmd5_to_md5(MD5P8_Contexts(ctx, 0), &tmp, i);
+        } else if (i < 4) {
+            pmd5_to_md5(MD5P8_Contexts(ctx, 0), &tmp, i);
+        } else {
+            pmd5_to_md5(MD5P8_Contexts(ctx, 1), &tmp, i - 4);
+        }
+#ifdef USE_OPENSSL
+        if (low + tmp.Nl < low) high++;
+        low += tmp.Nl;
+        high += tmp.Nh;
+#else
+        if (low + tmp.totalN < low) high++;
+        low += tmp.totalN;
+        high += tmp.totalN2;
+#endif
+        SIVALu(state, i*16, tmp.A);
+        SIVALu(state, i*16 + 4, tmp.B);
+        SIVALu(state, i*16 + 8, tmp.C);
+        SIVALu(state, i*16 + 12, tmp.D);
+    }
+
+#ifndef USE_OPENSSL
+	high = (low >> 29) | (high << 3);
+	low = (low << 3);
+#endif
+
+    sub <<= 3;
+    if (low - sub > low) high--;
+    low -= sub;
+
+    SIVALu(state, 32*4, low);
+    SIVALu(state, 33*4, high);
+
+    MD5_CTX md;
+    MD5_Init(&md);
+    MD5_Update(&md, state, 34*4);
+    MD5_Final(digest, &md);
+
+    return 1;
+}
+
 }
 
 #endif /* HAVE_SIMD */