From 6136ddb130efc4674217a3bbae9a8687ec3031aa Mon Sep 17 00:00:00 2001 From: Arthur Gautier Date: Mon, 4 Mar 2024 10:11:55 -0800 Subject: [PATCH] salsa20: revert sse2 This reverts #328. The changes introduced in #328 cause failures when the crate is used by `scrypt`: `scrypt_block_mix` produces a different value. I have not been able to figure out why that change breaks scrypt, so this reverts it until the cause is understood. --- Cargo.lock | 1 - salsa20/Cargo.toml | 1 - salsa20/src/backends.rs | 20 ----- salsa20/src/backends/soft.rs | 70 ---------------- salsa20/src/backends/sse2.rs | 156 ----------------------------------- salsa20/src/lib.rs | 121 +++++++++++++++------------ salsa20/src/xsalsa.rs | 17 +--- 7 files changed, 70 insertions(+), 316 deletions(-) delete mode 100644 salsa20/src/backends.rs delete mode 100644 salsa20/src/backends/soft.rs delete mode 100644 salsa20/src/backends/sse2.rs diff --git a/Cargo.lock b/Cargo.lock index 748854e0..a5540d1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -145,7 +145,6 @@ dependencies = [ name = "salsa20" version = "0.11.0-pre" dependencies = [ - "cfg-if", "cipher", "hex-literal", ] diff --git a/salsa20/Cargo.toml b/salsa20/Cargo.toml index 7d611c3d..5307bf3a 100644 --- a/salsa20/Cargo.toml +++ b/salsa20/Cargo.toml @@ -13,7 +13,6 @@ keywords = ["crypto", "stream-cipher", "trait", "xsalsa20"] categories = ["cryptography", "no-std"] [dependencies] -cfg-if = "1" cipher = "=0.5.0-pre.4" [dev-dependencies] diff --git a/salsa20/src/backends.rs b/salsa20/src/backends.rs deleted file mode 100644 index 49f13ee5..00000000 --- a/salsa20/src/backends.rs +++ /dev/null @@ -1,20 +0,0 @@ -use cfg_if::cfg_if; - -cfg_if! 
{ - if #[cfg(salsa20_force_sse2)] { - pub(crate) mod sse2; - } else if #[cfg(salsa20_force_soft)] { - pub(crate) mod soft; - } else { - pub(crate) mod sse2; - pub(crate) mod soft; - } - } - } else { - pub(crate) mod soft; - } -} diff --git a/salsa20/src/backends/soft.rs b/salsa20/src/backends/soft.rs deleted file mode 100644 index c7c2a91c..00000000 --- a/salsa20/src/backends/soft.rs +++ /dev/null @@ -1,70 +0,0 @@ -//! Portable implementation which does not rely on architecture-specific -//! intrinsics. - -use crate::{Block, SalsaCore, Unsigned, STATE_WORDS}; -use cipher::{ - consts::{U1, U64}, - BlockSizeUser, ParBlocksSizeUser, StreamBackend, StreamCipherSeekCore, -}; - -pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore); - -impl<'a, R: Unsigned> BlockSizeUser for Backend<'a, R> { - type BlockSize = U64; -} - -impl<'a, R: Unsigned> ParBlocksSizeUser for Backend<'a, R> { - type ParBlocksSize = U1; -} - -impl<'a, R: Unsigned> StreamBackend for Backend<'a, R> { - #[inline(always)] - fn gen_ks_block(&mut self, block: &mut Block) { - let res = run_rounds::(&self.0.state); - - self.0.set_block_pos(self.0.get_block_pos() + 1); - - for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) { - chunk.copy_from_slice(&val.to_le_bytes()); - } - } -} - -#[inline] -#[allow(clippy::many_single_char_names)] -pub(crate) fn quarter_round( - a: usize, - b: usize, - c: usize, - d: usize, - state: &mut [u32; STATE_WORDS], -) { - state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7); - state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9); - state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13); - state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18); -} - -#[inline(always)] -fn run_rounds(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] { - let mut res = *state; - - for _ in 0..R::USIZE { - // column rounds - quarter_round(0, 4, 8, 12, &mut res); - quarter_round(5, 9, 13, 1, &mut res); - quarter_round(10, 14, 2, 6, &mut res); - 
quarter_round(15, 3, 7, 11, &mut res); - - // diagonal rounds - quarter_round(0, 1, 2, 3, &mut res); - quarter_round(5, 6, 7, 4, &mut res); - quarter_round(10, 11, 8, 9, &mut res); - quarter_round(15, 12, 13, 14, &mut res); - } - - for (s1, s0) in res.iter_mut().zip(state.iter()) { - *s1 = s1.wrapping_add(*s0); - } - res -} diff --git a/salsa20/src/backends/sse2.rs b/salsa20/src/backends/sse2.rs deleted file mode 100644 index 3e0199a8..00000000 --- a/salsa20/src/backends/sse2.rs +++ /dev/null @@ -1,156 +0,0 @@ -use crate::{Block, StreamClosure, Unsigned, STATE_WORDS}; -use cipher::{ - consts::{U1, U64}, - BlockSizeUser, ParBlocksSizeUser, StreamBackend, -}; -use core::marker::PhantomData; - -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -#[inline] -#[target_feature(enable = "sse2")] -pub(crate) unsafe fn inner(state: &mut [u32; STATE_WORDS], f: F) -where - R: Unsigned, - F: StreamClosure, -{ - let state_ptr = state.as_ptr() as *const __m128i; - let mut backend = Backend:: { - v: [ - _mm_loadu_si128(state_ptr.add(0)), - _mm_loadu_si128(state_ptr.add(1)), - _mm_loadu_si128(state_ptr.add(2)), - _mm_loadu_si128(state_ptr.add(3)), - ], - _pd: PhantomData, - }; - - f.call(&mut backend); - - state[8] = _mm_cvtsi128_si32(backend.v[2]) as u32; -} - -struct Backend { - v: [__m128i; 4], - _pd: PhantomData, -} - -impl BlockSizeUser for Backend { - type BlockSize = U64; -} - -impl ParBlocksSizeUser for Backend { - type ParBlocksSize = U1; -} - -impl StreamBackend for Backend { - #[inline(always)] - fn gen_ks_block(&mut self, block: &mut Block) { - unsafe { - let res = rounds::(&self.v); - - self.v[2] = _mm_add_epi32(self.v[2], _mm_set_epi32(0, 0, 0, 1)); - let block_ptr = block.as_mut_ptr() as *mut __m128i; - - for (i, v) in res.iter().enumerate() { - _mm_storeu_si128(block_ptr.add(i), *v); - } - } - } -} - -#[inline] -#[target_feature(enable = "sse2")] -unsafe fn rounds(v: &[__m128i; 4]) -> [__m128i; 4] { - 
let mut res = *v; - - for _ in 0..R::USIZE { - double_round(&mut res); - } - - for i in 0..4 { - res[i] = _mm_add_epi32(res[i], v[i]); - } - - transpose(&mut res); - res[1] = _mm_shuffle_epi32(res[1], 0b_10_01_00_11); - res[2] = _mm_shuffle_epi32(res[2], 0b_01_00_11_10); - res[3] = _mm_shuffle_epi32(res[3], 0b_00_11_10_01); - transpose(&mut res); - - res -} - -/// The Salsa20 doubleround function for SSE2. -/// -/// https://users.rust-lang.org/t/can-the-compiler-infer-sse-instructions/59976 -#[inline] -#[target_feature(enable = "sse2")] -unsafe fn double_round([a, b, c, d]: &mut [__m128i; 4]) { - let mut t_sum: __m128i; - let mut t_rotl: __m128i; - - // Operate on "columns" - t_sum = _mm_add_epi32(*a, *d); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25)); - *b = _mm_xor_si128(*b, t_rotl); - - t_sum = _mm_add_epi32(*b, *a); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23)); - *c = _mm_xor_si128(*c, t_rotl); - - t_sum = _mm_add_epi32(*c, *b); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19)); - *d = _mm_xor_si128(*d, t_rotl); - - t_sum = _mm_add_epi32(*d, *c); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14)); - *a = _mm_xor_si128(*a, t_rotl); - - // Rearrange data. - *b = _mm_shuffle_epi32(*b, 0b_10_01_00_11); - *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10); - *d = _mm_shuffle_epi32(*d, 0b_00_11_10_01); - - // Operate on "rows". 
- t_sum = _mm_add_epi32(*a, *b); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25)); - *d = _mm_xor_si128(*d, t_rotl); - - t_sum = _mm_add_epi32(*d, *a); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23)); - *c = _mm_xor_si128(*c, t_rotl); - - t_sum = _mm_add_epi32(*c, *d); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19)); - *b = _mm_xor_si128(*b, t_rotl); - - t_sum = _mm_add_epi32(*b, *c); - t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14)); - *a = _mm_xor_si128(*a, t_rotl); - - // Rearrange data. - *b = _mm_shuffle_epi32(*b, 0b_00_11_10_01); - *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10); - *d = _mm_shuffle_epi32(*d, 0b_10_01_00_11); -} - -/// Transpose an integer 4 by 4 matrix in SSE2. -/// -/// https://randombit.net/bitbashing/posts/integer_matrix_transpose_in_sse2.html -#[inline] -#[target_feature(enable = "sse2")] -unsafe fn transpose([a, b, c, d]: &mut [__m128i; 4]) { - let t0 = _mm_unpacklo_epi32(*a, *b); - let t1 = _mm_unpacklo_epi32(*c, *d); - let t2 = _mm_unpackhi_epi32(*a, *b); - let t3 = _mm_unpackhi_epi32(*c, *d); - - *a = _mm_unpacklo_epi64(t0, t1); - *b = _mm_unpackhi_epi64(t0, t1); - *c = _mm_unpacklo_epi64(t2, t3); - *d = _mm_unpackhi_epi64(t2, t3); -} diff --git a/salsa20/src/lib.rs b/salsa20/src/lib.rs index 4e068220..45bc9f0d 100644 --- a/salsa20/src/lib.rs +++ b/salsa20/src/lib.rs @@ -61,21 +61,6 @@ //! assert_eq!(buffer, ciphertext); //! ``` //! -//! # Configuration Flags -//! -//! You can modify crate using the following configuration flags: -//! -//! - `salsa20_force_soft`: force software backend. -//! - `salsa20_force_sse2`: force SSE2 backend on x86/x86_64 targets. -//! Requires enabled SSE2 target feature. Ignored on non-x86(-64) targets. -//! -//! Salsa20 will run the SSE2 backend in x86(-64) targets unless `salsa20_force_soft` is set. -//! -//! The flags can be enabled using `RUSTFLAGS` environmental variable -//! (e.g. 
`RUSTFLAGS="--cfg salsa20_force_sse2"`) or by modifying `.cargo/config`. -//! -//! You SHOULD NOT enable several `force` flags simultaneously. -//! //! [Salsa]: https://en.wikipedia.org/wiki/Salsa20 #![no_std] @@ -87,21 +72,20 @@ )] #![warn(missing_docs, rust_2018_idioms, trivial_casts, unused_qualifications)] -use cfg_if::cfg_if; pub use cipher; use cipher::{ array::{typenum::Unsigned, Array}, - consts::{U10, U24, U32, U4, U6, U64, U8}, - Block, BlockSizeUser, IvSizeUser, KeyIvInit, KeySizeUser, StreamCipherCore, - StreamCipherCoreWrapper, StreamCipherSeekCore, StreamClosure, + consts::{U1, U10, U24, U32, U4, U6, U64, U8}, + Block, BlockSizeUser, IvSizeUser, KeyIvInit, KeySizeUser, ParBlocksSizeUser, StreamBackend, + StreamCipherCore, StreamCipherCoreWrapper, StreamCipherSeekCore, StreamClosure, }; use core::marker::PhantomData; #[cfg(feature = "zeroize")] use cipher::zeroize::{Zeroize, ZeroizeOnDrop}; -mod backends; +//mod backends; mod xsalsa; pub use xsalsa::{hsalsa, XSalsa12, XSalsa20, XSalsa8, XSalsaCore}; @@ -191,19 +175,6 @@ impl KeyIvInit for SalsaCore { state[15] = CONSTANTS[3]; - cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - #[cfg(not(salsa20_force_soft))] { - state = [ - state[0], state[5], state[10], state[15], - state[4], state[9], state[14], state[3], - state[8], state[13], state[2], state[7], - state[12], state[1], state[6], state[11], - ]; - } - } - } - Self { state, rounds: PhantomData, @@ -218,23 +189,7 @@ impl StreamCipherCore for SalsaCore { rem.try_into().ok() } fn process_with_backend(&mut self, f: impl StreamClosure) { - cfg_if! { - if #[cfg(salsa20_force_soft)] { - f.call(&mut backends::soft::Backend(self)); - } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - cfg_if! 
{ - if #[cfg(not(salsa20_force_soft))] { - unsafe { - backends::sse2::inner::(&mut self.state, f); - } - } else { - f.call(&mut backends::soft::Backend(self)); - } - } - } else { - f.call(&mut backends::soft::Backend(self)); - } - } + f.call(&mut Backend(self)); } } @@ -243,12 +198,13 @@ impl StreamCipherSeekCore for SalsaCore { #[inline(always)] fn get_block_pos(&self) -> u64 { - self.state[8] as u64 + (self.state[8] as u64) + ((self.state[9] as u64) << 32) } #[inline(always)] fn set_block_pos(&mut self, pos: u64) { - self.state[8] = pos as u32; + self.state[8] = (pos & 0xffff_ffff) as u32; + self.state[9] = ((pos >> 32) & 0xffff_ffff) as u32; } } @@ -263,3 +219,64 @@ impl Drop for SalsaCore { #[cfg(feature = "zeroize")] #[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] impl ZeroizeOnDrop for SalsaCore {} + +struct Backend<'a, R: Unsigned>(&'a mut SalsaCore); + +impl<'a, R: Unsigned> BlockSizeUser for Backend<'a, R> { + type BlockSize = U64; +} + +impl<'a, R: Unsigned> ParBlocksSizeUser for Backend<'a, R> { + type ParBlocksSize = U1; +} + +impl<'a, R: Unsigned> StreamBackend for Backend<'a, R> { + #[inline(always)] + fn gen_ks_block(&mut self, block: &mut Block) { + let res = run_rounds::(&self.0.state); + self.0.set_block_pos(self.0.get_block_pos() + 1); + + for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) { + chunk.copy_from_slice(&val.to_le_bytes()); + } + } +} + +#[inline] +#[allow(clippy::many_single_char_names)] +pub(crate) fn quarter_round( + a: usize, + b: usize, + c: usize, + d: usize, + state: &mut [u32; STATE_WORDS], +) { + state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7); + state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9); + state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13); + state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18); +} + +#[inline(always)] +fn run_rounds(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] { + let mut res = *state; + + for _ in 0..R::USIZE { + // column rounds + 
quarter_round(0, 4, 8, 12, &mut res); + quarter_round(5, 9, 13, 1, &mut res); + quarter_round(10, 14, 2, 6, &mut res); + quarter_round(15, 3, 7, 11, &mut res); + + // diagonal rounds + quarter_round(0, 1, 2, 3, &mut res); + quarter_round(5, 6, 7, 4, &mut res); + quarter_round(10, 11, 8, 9, &mut res); + quarter_round(15, 12, 13, 14, &mut res); + } + + for (s1, s0) in res.iter_mut().zip(state.iter()) { + *s1 = s1.wrapping_add(*s0); + } + res +} diff --git a/salsa20/src/xsalsa.rs b/salsa20/src/xsalsa.rs index fc8659a7..6316972b 100644 --- a/salsa20/src/xsalsa.rs +++ b/salsa20/src/xsalsa.rs @@ -1,6 +1,6 @@ //! XSalsa20 is an extended nonce variant of Salsa20 -use super::{Key, Nonce, SalsaCore, Unsigned, XNonce, CONSTANTS, STATE_WORDS}; +use super::{quarter_round, Key, Nonce, SalsaCore, Unsigned, XNonce, CONSTANTS}; use cipher::{ array::Array, consts::{U10, U16, U24, U32, U4, U6, U64}, @@ -136,18 +136,3 @@ pub fn hsalsa(key: &Key, input: &Array) -> Array output } - -/// The Salsa20 quarter round function -// for simplicity this function is copied from the software backend -pub(crate) fn quarter_round( - a: usize, - b: usize, - c: usize, - d: usize, - state: &mut [u32; STATE_WORDS], -) { - state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7); - state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9); - state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13); - state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18); -}