From 991fe31c61e26a5b810b84261305eb3e761b829d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 7 May 2026 19:23:44 +0300
Subject: [PATCH 1/4] sha2: remove macros from `sha256/x86_sha`

---
 sha2/src/consts.rs         |  22 -------
 sha2/src/sha256/x86_sha.rs | 120 ++++++++++++++++++-------------------
 2 files changed, 60 insertions(+), 82 deletions(-)

diff --git a/sha2/src/consts.rs b/sha2/src/consts.rs
index b82e9ebd4..95982afd9 100644
--- a/sha2/src/consts.rs
+++ b/sha2/src/consts.rs
@@ -68,25 +68,3 @@ pub(crate) const K64: [u64; 80] = [
     0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
     0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
 ];
-
-/// Swapped round constants for SHA-256 family of digests
-pub(crate) static K32X4: [[u32; 4]; 16] = {
-    let mut res = [[0u32; 4]; 16];
-    let mut i = 0;
-    while i < 16 {
-        res[i] = [K32[4 * i + 3], K32[4 * i + 2], K32[4 * i + 1], K32[4 * i]];
-        i += 1;
-    }
-    res
-};
-
-/// Swapped round constants for SHA-512 family of digests
-pub(crate) const K64X2: [[u64; 2]; 40] = {
-    let mut res = [[0u64; 2]; 40];
-    let mut i = 0;
-    while i < 40 {
-        res[i] = [K64[2 * i + 1], K64[2 * i]];
-        i += 1;
-    }
-    res
-};
diff --git a/sha2/src/sha256/x86_sha.rs b/sha2/src/sha256/x86_sha.rs
index a70ff69c9..7abbe55d6 100644
--- a/sha2/src/sha256/x86_sha.rs
+++ b/sha2/src/sha256/x86_sha.rs
@@ -10,47 +10,61 @@ use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
 
-#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
-unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
-    let t1 = _mm_sha256msg1_epu32(v0, v1);
-    let t2 = _mm_alignr_epi8(v3, v2, 4);
-    let t3 = _mm_add_epi32(t1, t2);
-    _mm_sha256msg2_epu32(t3, v3)
+#[target_feature(enable = "sha")]
+unsafe fn rounds4(r: usize, abef: &mut __m128i, cdgh: &mut __m128i, rest: __m128i) {
+    use crate::consts::K32;
+    let rk = _mm_set_epi32(
+        K32[4 * r + 3] as i32,
+        K32[4 * r + 2] as i32,
+        K32[4 * r + 1] as i32,
+        K32[4 * r] as i32,
+    );
+    let t1 = _mm_add_epi32(rest, rk);
+    *cdgh = _mm_sha256rnds2_epu32(*cdgh, *abef, t1);
+    let t2 = _mm_shuffle_epi32(t1, 0x0E);
+    *abef = _mm_sha256rnds2_epu32(*abef, *cdgh, t2);
 }
 
-macro_rules! rounds4 {
-    ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
-        let k = crate::consts::K32X4[$i];
-        let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
-        let t1 = _mm_add_epi32($rest, kv);
-        $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
-        let t2 = _mm_shuffle_epi32(t1, 0x0E);
-        $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
-    }};
+#[target_feature(enable = "sha,ssse3")]
+unsafe fn schedule_rounds16(
+    r: usize,
+    abef: &mut __m128i,
+    cdgh: &mut __m128i,
+    w: &mut [__m128i; 4],
+) {
+    for i in 0..4 {
+        let w0 = w[i];
+        let w1 = w[(i + 1) % 4];
+        let w2 = w[(i + 2) % 4];
+        let w3 = w[(i + 3) % 4];
+
+        let t1 = _mm_sha256msg1_epu32(w0, w1);
+        let t2 = _mm_alignr_epi8(w3, w2, 4);
+        let t3 = _mm_add_epi32(t1, t2);
+
+        w[i] = _mm_sha256msg2_epu32(t3, w3);
+
+        rounds4(r + i, abef, cdgh, w[i]);
+    }
 }
 
-macro_rules! 
schedule_rounds4 { - ( - $abef:ident, $cdgh:ident, - $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, - $i: expr - ) => {{ - $w4 = schedule($w0, $w1, $w2, $w3); - rounds4!($abef, $cdgh, $w4, $i); - }}; +#[target_feature(enable = "ssse3")] +unsafe fn read_block(block: &[u8; 64]) -> [__m128i; 4] { + let block_ptr: *const __m128i = block.as_ptr().cast(); + let mask = _mm_set_epi64x(0x0C0D_0E0F_0809_0A0B, 0x0405_0607_0001_0203); + core::array::from_fn(|i| { + let w = _mm_loadu_si128(block_ptr.add(i)); + _mm_shuffle_epi8(w, mask) + }) } -// we use unaligned loads with `__m128i` pointers -#[allow(clippy::cast_ptr_alignment)] -#[target_feature(enable = "sha,sse2,ssse3,sse4.1")] +#[allow( + clippy::cast_ptr_alignment, + reason = "we use unaligned loads with `__m128i` pointers" +)] +#[target_feature(enable = "sha,sse4.1")] pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - #[allow(non_snake_case)] - let MASK: __m128i = _mm_set_epi64x( - 0x0C0D_0E0F_0809_0A0Bu64 as i64, - 0x0405_0607_0001_0203u64 as i64, - ); - - let state_ptr: *const __m128i = state.as_ptr().cast(); + let state_ptr: *mut __m128i = state.as_mut_ptr().cast(); let dcba = _mm_loadu_si128(state_ptr.add(0)); let hgfe = _mm_loadu_si128(state_ptr.add(1)); @@ -63,29 +77,16 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { let abef_save = abef; let cdgh_save = cdgh; - let block_ptr: *const __m128i = block.as_ptr().cast(); - let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(0)), MASK); - let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(1)), MASK); - let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(2)), MASK); - let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(3)), MASK); - let mut w4; - - rounds4!(abef, cdgh, w0, 0); - rounds4!(abef, cdgh, w1, 1); - rounds4!(abef, cdgh, w2, 2); - rounds4!(abef, cdgh, w3, 3); - schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); - schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); - schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); - schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); - schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); - schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); - schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); - schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); - schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); - schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); - schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); - schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); + let mut w = read_block(block); + + rounds4(0, &mut abef, &mut cdgh, w[0]); + rounds4(1, &mut abef, &mut cdgh, w[1]); + rounds4(2, &mut abef, &mut cdgh, w[2]); + rounds4(3, &mut abef, &mut cdgh, w[3]); + + schedule_rounds16(4, &mut abef, &mut cdgh, &mut w); + schedule_rounds16(8, &mut abef, &mut cdgh, &mut w); + schedule_rounds16(12, &mut abef, &mut cdgh, &mut w); abef = _mm_add_epi32(abef, abef_save); cdgh = _mm_add_epi32(cdgh, cdgh_save); @@ -96,7 +97,6 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { let dcba = _mm_blend_epi16(feba, dchg, 0xF0); let hgef = _mm_alignr_epi8(dchg, feba, 8); - let state_ptr_mut: *mut __m128i = state.as_mut_ptr().cast(); - _mm_storeu_si128(state_ptr_mut.add(0), dcba); - _mm_storeu_si128(state_ptr_mut.add(1), hgef); + _mm_storeu_si128(state_ptr.add(0), dcba); + _mm_storeu_si128(state_ptr.add(1), hgef); } From 27574d7b521a3717b3183900e2ba9e4f3760a320 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 7 May 2026 19:37:42 +0300 Subject: [PATCH 2/4] remove `#![allow(dead_code)]` from `consts` --- sha2/src/consts.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/sha2/src/consts.rs b/sha2/src/consts.rs index 95982afd9..65515ea91 100644 --- a/sha2/src/consts.rs +++ b/sha2/src/consts.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] - pub(crate) type State256 = [u32; 8]; pub(crate) type State512 = [u64; 8]; From 607c1680a097fac3ae50db62cb00925d4d9833a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 7 May 2026 19:40:55 +0300 Subject: [PATCH 3/4] remove `#[allow(clippy::cast_ptr_alignment)]` --- sha2/src/sha256/x86_sha.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sha2/src/sha256/x86_sha.rs b/sha2/src/sha256/x86_sha.rs index 7abbe55d6..af8142d01 100644 --- a/sha2/src/sha256/x86_sha.rs +++ b/sha2/src/sha256/x86_sha.rs @@ -58,10 +58,6 @@ unsafe fn read_block(block: &[u8; 64]) -> [__m128i; 4] { }) } -#[allow( - clippy::cast_ptr_alignment, - reason = "we use unaligned loads with `__m128i` pointers" -)] #[target_feature(enable = "sha,sse4.1")] pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { let state_ptr: *mut __m128i = state.as_mut_ptr().cast(); From a4d939ffcae51bceec3739f5d45560204d35b6b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 7 May 2026 19:48:07 +0300 Subject: [PATCH 4/4] tweak target feature detection --- sha2/src/sha256.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index a0fc50645..5239ca15a 100644 --- a/sha2/src/sha256.rs +++ b/sha2/src/sha256.rs @@ -20,11 +20,9 @@ cfg_if::cfg_if! { #[cfg(not(all( target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", target_feature = "sse4.1", )))] - compile_error!("x86-sha backend requires sha, sse2, ssse3, sse4.1 target features"); + compile_error!("x86-sha backend requires sha and sse4.1 target features"); fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { // SAFETY: we checked above that the required target features are enabled @@ -52,7 +50,7 @@ cfg_if::cfg_if! { cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { mod x86_sha; - cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1"); + cpufeatures::new!(shani_cpuid, "sha", "sse4.1"); } else if #[cfg(target_arch = "aarch64")] { mod aarch64_sha2; cpufeatures::new!(sha2_hwcap, "sha2");
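
Series note (illustrative, not part of the patches): the four-word message-schedule step that the new `schedule_rounds16` evaluates with `_mm_sha256msg1_epu32`, `_mm_alignr_epi8`, and `_mm_sha256msg2_epu32` is the FIPS 180-4 recurrence W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]. A minimal scalar sketch of that recurrence for reference (helper names are ours, not from the crate):

    fn small_sigma0(x: u32) -> u32 {
        // sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ SHR^3(x)
        x.rotate_right(7) ^ x.rotate_right(18) ^ (x >> 3)
    }

    fn small_sigma1(x: u32) -> u32 {
        // sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
        x.rotate_right(17) ^ x.rotate_right(19) ^ (x >> 10)
    }

    /// Expands W[16..64] from the 16 message words of one block (scalar reference).
    fn expand_schedule(w: &mut [u32; 64]) {
        for t in 16..64 {
            w[t] = small_sigma1(w[t - 2])
                .wrapping_add(w[t - 7])
                .wrapping_add(small_sigma0(w[t - 15]))
                .wrapping_add(w[t - 16]);
        }
    }

Each `schedule_rounds16` call advances this recurrence by sixteen values of t (four per `rounds4` call) while keeping the latest sixteen schedule words in the rotating `w: [__m128i; 4]` window, so the three calls in `compress` cover rounds 16..63.

Since the series is intended to be behavior-preserving, a known-answer test through the public API is a quick regression check; a minimal sketch, assuming the `hex-literal` crate as a dev-dependency (test name is ours):

    use hex_literal::hex;
    use sha2::{Digest, Sha256};

    #[test]
    fn sha256_abc_vector() {
        // FIPS 180-4 "abc" test vector
        let digest = Sha256::digest(b"abc");
        assert_eq!(
            digest[..],
            hex!("ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad")[..],
        );
    }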