Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 0 additions & 24 deletions sha2/src/consts.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#![allow(dead_code)]

pub(crate) type State256 = [u32; 8];
pub(crate) type State512 = [u64; 8];

Expand Down Expand Up @@ -68,25 +66,3 @@ pub(crate) const K64: [u64; 80] = [
0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
];

/// Swapped round constants for SHA-256 family of digests
pub(crate) static K32X4: [[u32; 4]; 16] = {
let mut res = [[0u32; 4]; 16];
let mut i = 0;
while i < 16 {
res[i] = [K32[4 * i + 3], K32[4 * i + 2], K32[4 * i + 1], K32[4 * i]];
i += 1;
}
res
};

/// Swapped round constants for SHA-512 family of digests
pub(crate) const K64X2: [[u64; 2]; 40] = {
let mut res = [[0u64; 2]; 40];
let mut i = 0;
while i < 16 {
res[i] = [K64[4 * i + 1], K64[4 * i]];
i += 1;
}
res
};
6 changes: 2 additions & 4 deletions sha2/src/sha256.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,9 @@ cfg_if::cfg_if! {

#[cfg(not(all(
target_feature = "sha",
target_feature = "sse2",
target_feature = "ssse3",
target_feature = "sse4.1",
)))]
compile_error!("x86-sha backend requires sha, sse2, ssse3, sse4.1 target features");
compile_error!("x86-sha backend requires sha and sse4.1 target features");
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that sse4.1 implies ssse3, which implies sse3, which in turn (together with sha) implies sse2.


fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
// SAFETY: we checked above that the required target features are enabled
Expand Down Expand Up @@ -52,7 +50,7 @@ cfg_if::cfg_if! {
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
mod x86_sha;
cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1");
cpufeatures::new!(shani_cpuid, "sha", "sse4.1");
} else if #[cfg(target_arch = "aarch64")] {
mod aarch64_sha2;
cpufeatures::new!(sha2_hwcap, "sha2");
Expand Down
116 changes: 56 additions & 60 deletions sha2/src/sha256/x86_sha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,47 +10,57 @@ use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
let t1 = _mm_sha256msg1_epu32(v0, v1);
let t2 = _mm_alignr_epi8(v3, v2, 4);
let t3 = _mm_add_epi32(t1, t2);
_mm_sha256msg2_epu32(t3, v3)
/// Performs one group of four SHA-256 rounds (rounds `4*r .. 4*r+4`) on the
/// packed working state.
///
/// * `r`    — index of the 4-round group; selects `K32[4*r .. 4*r+4]` as the
///            round constants.
/// * `abef` — packed working variables a, b, e, f (updated in place).
/// * `cdgh` — packed working variables c, d, g, h (updated in place).
/// * `rest` — the four message-schedule words consumed by these rounds.
///
/// # Safety
/// Caller must ensure the `sha` target feature is available at runtime.
#[target_feature(enable = "sha")]
unsafe fn rounds4(r: usize, abef: &mut __m128i, cdgh: &mut __m128i, rest: __m128i) {
    use crate::consts::K32;
    // Build the round-constant vector for this group. Elements are passed
    // high-to-low to `_mm_set_epi32`, so lane 0 ends up holding K32[4*r].
    let rk = _mm_set_epi32(
        K32[4 * r + 3] as i32,
        K32[4 * r + 2] as i32,
        K32[4 * r + 1] as i32,
        K32[4 * r] as i32,
    );
    // Pre-add constants to the schedule words; `sha256rnds2` takes
    // (W + K) in its third operand.
    let t1 = _mm_add_epi32(rest, rk);
    // Each `_mm_sha256rnds2_epu32` performs two rounds using the two low
    // lanes of its third operand (per the Intel intrinsic definition).
    *cdgh = _mm_sha256rnds2_epu32(*cdgh, *abef, t1);
    // Move the upper two (W + K) lanes into the low positions for the
    // second pair of rounds (0x0E selects lanes 2,3,2,3 — low halves used).
    let t2 = _mm_shuffle_epi32(t1, 0x0E);
    *abef = _mm_sha256rnds2_epu32(*abef, *cdgh, t2);
}

macro_rules! rounds4 {
($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
let k = crate::consts::K32X4[$i];
let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
let t1 = _mm_add_epi32($rest, kv);
$cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
let t2 = _mm_shuffle_epi32(t1, 0x0E);
$abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
}};
/// Performs 16 SHA-256 rounds (groups `r .. r+4`), extending the message
/// schedule on the fly.
///
/// * `r`    — index of the first 4-round group (so this covers rounds
///            `4*r .. 4*r+16`).
/// * `abef` — packed working variables a, b, e, f (updated in place).
/// * `cdgh` — packed working variables c, d, g, h (updated in place).
/// * `w`    — rolling window of the last 16 schedule words, four per vector;
///            each slot is overwritten with the next four words as it is
///            consumed.
///
/// # Safety
/// Caller must ensure the `sha` and `ssse3` target features are available
/// at runtime (`_mm_alignr_epi8` requires ssse3).
#[target_feature(enable = "sha,ssse3")]
unsafe fn schedule_rounds16(
    r: usize,
    abef: &mut __m128i,
    cdgh: &mut __m128i,
    w: &mut [__m128i; 4],
) {
    for i in 0..4 {
        // View the 4-vector ring buffer with `w0` as the oldest entry.
        let w0 = w[i];
        let w1 = w[(i + 1) % 4];
        let w2 = w[(i + 2) % 4];
        let w3 = w[(i + 3) % 4];

        // Message-schedule extension: `sha256msg1` combines W[t-16]/W[t-15]
        // terms, `alignr` supplies the W[t-7] words, and `sha256msg2`
        // finishes the sigma-1 part using the most recent words in `w3`.
        let t1 = _mm_sha256msg1_epu32(w0, w1);
        let t2 = _mm_alignr_epi8(w3, w2, 4);
        let t3 = _mm_add_epi32(t1, t2);

        // The oldest slot is recycled to hold the four new schedule words.
        w[i] = _mm_sha256msg2_epu32(t3, w3);

        rounds4(r + i, abef, cdgh, w[i]);
    }
}

macro_rules! schedule_rounds4 {
(
$abef:ident, $cdgh:ident,
$w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
$i: expr
) => {{
$w4 = schedule($w0, $w1, $w2, $w3);
rounds4!($abef, $cdgh, $w4, $i);
}};
/// Loads a 64-byte message block as four `__m128i` vectors of schedule words.
///
/// SHA-256 message words are big-endian; the shuffle mask reverses the bytes
/// within each 32-bit lane (mask bytes 3,2,1,0 / 7,6,5,4 / …) so each lane
/// holds a host-order `u32`, while the lane order itself is preserved.
///
/// # Safety
/// Caller must ensure the `ssse3` target feature is available at runtime
/// (`_mm_shuffle_epi8` requires ssse3).
#[target_feature(enable = "ssse3")]
unsafe fn read_block(block: &[u8; 64]) -> [__m128i; 4] {
    // The cast does not require 16-byte alignment: `_mm_loadu_si128` below
    // is an unaligned load.
    let block_ptr: *const __m128i = block.as_ptr().cast();
    let mask = _mm_set_epi64x(0x0C0D_0E0F_0809_0A0B, 0x0405_0607_0001_0203);
    core::array::from_fn(|i| {
        let w = _mm_loadu_si128(block_ptr.add(i));
        _mm_shuffle_epi8(w, mask)
    })
}

// we use unaligned loads with `__m128i` pointers
#[allow(clippy::cast_ptr_alignment)]
#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
#[target_feature(enable = "sha,sse4.1")]
pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
#[allow(non_snake_case)]
let MASK: __m128i = _mm_set_epi64x(
0x0C0D_0E0F_0809_0A0Bu64 as i64,
0x0405_0607_0001_0203u64 as i64,
);

let state_ptr: *const __m128i = state.as_ptr().cast();
let state_ptr: *mut __m128i = state.as_mut_ptr().cast();
let dcba = _mm_loadu_si128(state_ptr.add(0));
let hgfe = _mm_loadu_si128(state_ptr.add(1));

Expand All @@ -63,29 +73,16 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
let abef_save = abef;
let cdgh_save = cdgh;

let block_ptr: *const __m128i = block.as_ptr().cast();
let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(0)), MASK);
let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(1)), MASK);
let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(2)), MASK);
let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(3)), MASK);
let mut w4;

rounds4!(abef, cdgh, w0, 0);
rounds4!(abef, cdgh, w1, 1);
rounds4!(abef, cdgh, w2, 2);
rounds4!(abef, cdgh, w3, 3);
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);
let mut w = read_block(block);

rounds4(0, &mut abef, &mut cdgh, w[0]);
rounds4(1, &mut abef, &mut cdgh, w[1]);
rounds4(2, &mut abef, &mut cdgh, w[2]);
rounds4(3, &mut abef, &mut cdgh, w[3]);

schedule_rounds16(4, &mut abef, &mut cdgh, &mut w);
schedule_rounds16(8, &mut abef, &mut cdgh, &mut w);
schedule_rounds16(12, &mut abef, &mut cdgh, &mut w);

abef = _mm_add_epi32(abef, abef_save);
cdgh = _mm_add_epi32(cdgh, cdgh_save);
Expand All @@ -96,7 +93,6 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
let hgef = _mm_alignr_epi8(dchg, feba, 8);

let state_ptr_mut: *mut __m128i = state.as_mut_ptr().cast();
_mm_storeu_si128(state_ptr_mut.add(0), dcba);
_mm_storeu_si128(state_ptr_mut.add(1), hgef);
_mm_storeu_si128(state_ptr.add(0), dcba);
_mm_storeu_si128(state_ptr.add(1), hgef);
}
Loading