From b2d1e84abf731120e8c049d9d816bec64550fb4e Mon Sep 17 00:00:00 2001 From: Remco Bloemen Date: Tue, 1 Nov 2022 09:08:45 -0700 Subject: [PATCH] keccak: add `aarch64` asm intrinsics implementation keccak_f1600 (#23) --- .github/workflows/keccak.yml | 35 +++++++ keccak/src/aarch64_sha3.rs | 196 +++++++++++++++++++++++++++++++++++ keccak/src/lib.rs | 6 ++ 3 files changed, 237 insertions(+) create mode 100644 keccak/src/aarch64_sha3.rs diff --git a/.github/workflows/keccak.yml b/.github/workflows/keccak.yml index 1e1ea8b..5ec3a69 100644 --- a/.github/workflows/keccak.yml +++ b/.github/workflows/keccak.yml @@ -116,3 +116,38 @@ jobs: package: ${{ github.workflow }} target: ${{ matrix.target }} features: ${{ matrix.features }} + + aarch64-sha3: + needs: set-msrv + runs-on: ubuntu-latest + defaults: + run: + # Cross mounts only current package, i.e. by default it ignores workspace's Cargo.toml + working-directory: . + strategy: + matrix: + rust: + - ${{needs.set-msrv.outputs.msrv}} + - stable + steps: + - uses: actions/checkout@v3 + - uses: RustCrypto/actions/cargo-cache@master + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + target: aarch64-unknown-linux-gnu + override: true + - name: Install pre-compiled cross + run: | + # We need a recent version for RUSTFLAGS to work. + wget -O /tmp/binaries.tar.gz https://github.com/cross-rs/cross/releases/download/v0.2.4/cross-x86_64-unknown-linux-gnu.tar.gz + tar -C /tmp -xzf /tmp/binaries.tar.gz + mv /tmp/cross ~/.cargo/bin + shell: bash + - name: cross test + run: | + cd keccak + # Cross doesn't enable `sha3` by default, but QEMU supports it. + export RUSTFLAGS="-C target-feature=+sha3" + cross test --target aarch64-unknown-linux-gnu --no-default-features diff --git a/keccak/src/aarch64_sha3.rs b/keccak/src/aarch64_sha3.rs new file mode 100644 index 0000000..93a640d --- /dev/null +++ b/keccak/src/aarch64_sha3.rs @@ -0,0 +1,196 @@ +#![cfg(all(target_arch = "aarch64", target_feature = "sha3"))] + +/// Keccak-f1600 on ARMv8.4-A with FEAT_SHA3. +/// +/// See p. K12.2.2 p. 11,749 of the ARM Reference manual. +/// Adapted from the Keccak-f1600 implementation in the XKCP/K12. +/// see +pub fn keccak_f1600(state: &mut [u64; 25]) { + unsafe { + core::arch::asm!(" + // Read state + ld1.1d {{ v0- v3}}, [x0], #32 + ld1.1d {{ v4- v7}}, [x0], #32 + ld1.1d {{ v8-v11}}, [x0], #32 + ld1.1d {{v12-v15}}, [x0], #32 + ld1.1d {{v16-v19}}, [x0], #32 + ld1.1d {{v20-v23}}, [x0], #32 + ld1.1d {{v24}}, [x0] + sub x0, x0, #192 + + // Loop 24 rounds + // NOTE: This loop actually computes two f1600 functions in + // parallel, in both the lower and the upper 64-bit of the + // 128-bit registers v0-v24. + mov x8, #24 + 0: sub x8, x8, #1 + + // Theta Calculations + eor3.16b v25, v20, v15, v10 + eor3.16b v26, v21, v16, v11 + eor3.16b v27, v22, v17, v12 + eor3.16b v28, v23, v18, v13 + eor3.16b v29, v24, v19, v14 + eor3.16b v25, v25, v5, v0 + eor3.16b v26, v26, v6, v1 + eor3.16b v27, v27, v7, v2 + eor3.16b v28, v28, v8, v3 + eor3.16b v29, v29, v9, v4 + rax1.2d v30, v25, v27 + rax1.2d v31, v26, v28 + rax1.2d v27, v27, v29 + rax1.2d v28, v28, v25 + rax1.2d v29, v29, v26 + + // Rho and Phi + eor.16b v0, v0, v29 + xar.2d v25, v1, v30, #64 - 1 + xar.2d v1, v6, v30, #64 - 44 + xar.2d v6, v9, v28, #64 - 20 + xar.2d v9, v22, v31, #64 - 61 + xar.2d v22, v14, v28, #64 - 39 + xar.2d v14, v20, v29, #64 - 18 + xar.2d v26, v2, v31, #64 - 62 + xar.2d v2, v12, v31, #64 - 43 + xar.2d v12, v13, v27, #64 - 25 + xar.2d v13, v19, v28, #64 - 8 + xar.2d v19, v23, v27, #64 - 56 + xar.2d v23, v15, v29, #64 - 41 + xar.2d v15, v4, v28, #64 - 27 + xar.2d v28, v24, v28, #64 - 14 + xar.2d v24, v21, v30, #64 - 2 + xar.2d v8, v8, v27, #64 - 55 + xar.2d v4, v16, v30, #64 - 45 + xar.2d v16, v5, v29, #64 - 36 + xar.2d v5, v3, v27, #64 - 28 + xar.2d v27, v18, v27, #64 - 21 + xar.2d v3, v17, v31, #64 - 15 + xar.2d v30, v11, v30, #64 - 10 + xar.2d v31, v7, v31, #64 - 6 + xar.2d v29, v10, v29, #64 - 3 + + // Chi and Iota + bcax.16b v20, v26, v22, v8 + bcax.16b v21, v8, v23, v22 + bcax.16b v22, v22, v24, v23 + bcax.16b v23, v23, v26, v24 + bcax.16b v24, v24, v8, v26 + + ld1r.2d {{v26}}, [x1], #8 + + bcax.16b v17, v30, v19, v3 + bcax.16b v18, v3, v15, v19 + bcax.16b v19, v19, v16, v15 + bcax.16b v15, v15, v30, v16 + bcax.16b v16, v16, v3, v30 + + bcax.16b v10, v25, v12, v31 + bcax.16b v11, v31, v13, v12 + bcax.16b v12, v12, v14, v13 + bcax.16b v13, v13, v25, v14 + bcax.16b v14, v14, v31, v25 + + bcax.16b v7, v29, v9, v4 + bcax.16b v8, v4, v5, v9 + bcax.16b v9, v9, v6, v5 + bcax.16b v5, v5, v29, v6 + bcax.16b v6, v6, v4, v29 + + bcax.16b v3, v27, v0, v28 + bcax.16b v4, v28, v1, v0 + bcax.16b v0, v0, v2, v1 + bcax.16b v1, v1, v27, v2 + bcax.16b v2, v2, v28, v27 + + eor.16b v0,v0,v26 + + // Rounds loop + cbnz w8, 0b + + // Write state + st1.1d {{ v0- v3}}, [x0], #32 + st1.1d {{ v4- v7}}, [x0], #32 + st1.1d {{ v8-v11}}, [x0], #32 + st1.1d {{v12-v15}}, [x0], #32 + st1.1d {{v16-v19}}, [x0], #32 + st1.1d {{v20-v23}}, [x0], #32 + st1.1d {{v24}}, [x0] + ", + in("x0") state.as_mut_ptr(), + in("x1") crate::RC.as_ptr(), + clobber_abi("C"), + options(nostack) + ); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_keccak_f1600() { + // Test vectors are copied from XKCP (eXtended Keccak Code Package) + // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt + let state_first = [ + 0xF1258F7940E1DDE7, + 0x84D5CCF933C0478A, + 0xD598261EA65AA9EE, + 0xBD1547306F80494D, + 0x8B284E056253D057, + 0xFF97A42D7F8E6FD4, + 0x90FEE5A0A44647C4, + 0x8C5BDA0CD6192E76, + 0xAD30A6F71B19059C, + 0x30935AB7D08FFC64, + 0xEB5AA93F2317D635, + 0xA9A6E6260D712103, + 0x81A57C16DBCF555F, + 0x43B831CD0347C826, + 0x01F22F1A11A5569F, + 0x05E5635A21D9AE61, + 0x64BEFEF28CC970F2, + 0x613670957BC46611, + 0xB87C5A554FD00ECB, + 0x8C3EE88A1CCF32C8, + 0x940C7922AE3A2614, + 0x1841F924A2C509E4, + 0x16F53526E70465C2, + 0x75F644E97F30A13B, + 0xEAF1FF7B5CECA249, + ]; + let state_second = [ + 0x2D5C954DF96ECB3C, + 0x6A332CD07057B56D, + 0x093D8D1270D76B6C, + 0x8A20D9B25569D094, + 0x4F9C4F99E5E7F156, + 0xF957B9A2DA65FB38, + 0x85773DAE1275AF0D, + 0xFAF4F247C3D810F7, + 0x1F1B9EE6F79A8759, + 0xE4FECC0FEE98B425, + 0x68CE61B6B9CE68A1, + 0xDEEA66C4BA8F974F, + 0x33C43D836EAFB1F5, + 0xE00654042719DBD9, + 0x7CF8A9F009831265, + 0xFD5449A6BF174743, + 0x97DDAD33D8994B40, + 0x48EAD5FC5D0BE774, + 0xE3B8C8EE55B7B03C, + 0x91A0226E649E42E9, + 0x900E3129E7BADD7B, + 0x202A9EC5FAA3CCE8, + 0x5B3402464E1C3DB6, + 0x609F4E62A44C1059, + 0x20D06CD26A8FBF5C, + ]; + + let mut state = [0u64; 25]; + keccak_f1600(&mut state); + assert_eq!(state, state_first); + keccak_f1600(&mut state); + assert_eq!(state, state_second); + } +} diff --git a/keccak/src/lib.rs b/keccak/src/lib.rs index 87abf9e..9ce5559 100644 --- a/keccak/src/lib.rs +++ b/keccak/src/lib.rs @@ -48,6 +48,7 @@ use core::{ #[rustfmt::skip] mod unroll; +mod aarch64_sha3; const PLEN: usize = 25; @@ -143,8 +144,13 @@ macro_rules! impl_keccak { impl_keccak!(f200, u8); impl_keccak!(f400, u16); impl_keccak!(f800, u32); + +#[cfg(not(all(target_arch = "aarch64", target_feature = "sha3")))] impl_keccak!(f1600, u64); +#[cfg(all(target_arch = "aarch64", target_feature = "sha3"))] +pub use aarch64_sha3::keccak_f1600 as f1600; + #[cfg(feature = "simd")] /// SIMD implementations for Keccak-f1600 sponge function pub mod simd {