Skip to content

Commit

Permalink
sha2: add read_volatile workaround for round constants (#547)
Browse files Browse the repository at this point in the history
Prevents the compiler from inlining round constants or spilling them to
the stack, which can slightly improve performance.
  • Loading branch information
newpavlov committed Jan 12, 2024
1 parent 05db3ae commit 3020704
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 111 deletions.
125 changes: 55 additions & 70 deletions sha2/src/consts.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,40 @@
#![allow(dead_code, clippy::unreadable_literal)]
#![allow(dead_code)]

pub const STATE_LEN: usize = 8;
pub const BLOCK_LEN: usize = 16;
pub type State256 = [u32; 8];
pub type State512 = [u64; 8];

pub type State256 = [u32; STATE_LEN];
pub type State512 = [u64; STATE_LEN];
/// Eight-word `State256` constant for the 224-bit variant of the 32-bit-word digest family.
///
/// NOTE(review): the values look like the SHA-224 initial hash value from FIPS 180-4 —
/// confirm against the specification.
pub const H256_224: State256 = [
0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939,
0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4,
];

/// Eight-word `State256` constant for the 256-bit variant of the 32-bit-word digest family.
///
/// NOTE(review): the values look like the SHA-256 initial hash value from FIPS 180-4 —
/// confirm against the specification.
pub const H256_256: State256 = [
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
];

/// Eight-word `State512` constant for the 224-bit variant of the 64-bit-word digest family.
///
/// NOTE(review): the values look like the SHA-512/224 initial hash value from FIPS 180-4 —
/// confirm against the specification.
pub const H512_224: State512 = [
0x8c3d37c819544da2, 0x73e1996689dcd4d6, 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf,
0x0f6d2b697bd44da8, 0x77e36f7304c48942, 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1,
];

/// Constants necessary for SHA-256 family of digests.
pub const K32: [u32; 64] = [
/// Eight-word `State512` constant for the 256-bit variant of the 64-bit-word digest family.
///
/// NOTE(review): the values look like the SHA-512/256 initial hash value from FIPS 180-4 —
/// confirm against the specification.
pub const H512_256: State512 = [
0x22312194fc2bf72c, 0x9f555fa3c84c64c2, 0x2393b86b6f53b151, 0x963877195940eabd,
0x96283ee2a88effe3, 0xbe5e1e2553863992, 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2,
];

/// Eight-word `State512` constant for the 384-bit variant of the 64-bit-word digest family.
///
/// NOTE(review): the values look like the SHA-384 initial hash value from FIPS 180-4 —
/// confirm against the specification.
pub const H512_384: State512 = [
0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
];

/// Eight-word `State512` constant for the 512-bit variant of the 64-bit-word digest family.
///
/// NOTE(review): the values look like the SHA-512 initial hash value from FIPS 180-4 —
/// confirm against the specification.
pub const H512_512: State512 = [
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
];

/// Round constants for SHA-256 family of digests
pub static K32: [u32; 64] = [
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
Expand All @@ -18,27 +45,7 @@ pub const K32: [u32; 64] = [
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
];

/// Constants necessary for SHA-256 family of digests.
pub const K32X4: [[u32; 4]; 16] = [
[K32[3], K32[2], K32[1], K32[0]],
[K32[7], K32[6], K32[5], K32[4]],
[K32[11], K32[10], K32[9], K32[8]],
[K32[15], K32[14], K32[13], K32[12]],
[K32[19], K32[18], K32[17], K32[16]],
[K32[23], K32[22], K32[21], K32[20]],
[K32[27], K32[26], K32[25], K32[24]],
[K32[31], K32[30], K32[29], K32[28]],
[K32[35], K32[34], K32[33], K32[32]],
[K32[39], K32[38], K32[37], K32[36]],
[K32[43], K32[42], K32[41], K32[40]],
[K32[47], K32[46], K32[45], K32[44]],
[K32[51], K32[50], K32[49], K32[48]],
[K32[55], K32[54], K32[53], K32[52]],
[K32[59], K32[58], K32[57], K32[56]],
[K32[63], K32[62], K32[61], K32[60]],
];

/// Constants necessary for SHA-512 family of digests.
/// Round constants for SHA-512 family of digests
pub const K64: [u64; 80] = [
0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
Expand All @@ -62,46 +69,24 @@ pub const K64: [u64; 80] = [
0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
];

/// Constants necessary for SHA-512 family of digests.
pub const K64X2: [[u64; 2]; 40] = [
[K64[1], K64[0]], [K64[3], K64[2]], [K64[5], K64[4]], [K64[7], K64[6]],
[K64[9], K64[8]], [K64[11], K64[10]], [K64[13], K64[12]], [K64[15], K64[14]],
[K64[17], K64[16]], [K64[19], K64[18]], [K64[21], K64[20]], [K64[23], K64[22]],
[K64[25], K64[24]], [K64[27], K64[26]], [K64[29], K64[28]], [K64[31], K64[30]],
[K64[33], K64[32]], [K64[35], K64[34]], [K64[37], K64[36]], [K64[39], K64[38]],
[K64[41], K64[40]], [K64[43], K64[42]], [K64[45], K64[44]], [K64[47], K64[46]],
[K64[49], K64[48]], [K64[51], K64[50]], [K64[53], K64[52]], [K64[55], K64[54]],
[K64[57], K64[56]], [K64[59], K64[58]], [K64[61], K64[60]], [K64[63], K64[62]],
[K64[65], K64[64]], [K64[67], K64[66]], [K64[69], K64[68]], [K64[71], K64[70]],
[K64[73], K64[72]], [K64[75], K64[74]], [K64[77], K64[76]], [K64[79], K64[78]],
];

pub const H256_224: State256 = [
0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939,
0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4,
];

pub const H256_256: State256 = [
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
];

pub const H512_224: State512 = [
0x8c3d37c819544da2, 0x73e1996689dcd4d6, 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf,
0x0f6d2b697bd44da8, 0x77e36f7304c48942, 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1,
];

pub const H512_256: State512 = [
0x22312194fc2bf72c, 0x9f555fa3c84c64c2, 0x2393b86b6f53b151, 0x963877195940eabd,
0x96283ee2a88effe3, 0xbe5e1e2553863992, 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2,
];
/// Round constants for the SHA-256 family, grouped four per row with each group reversed.
///
/// Row `r` is `[K32[4r + 3], K32[4r + 2], K32[4r + 1], K32[4r]]`; the table is built at
/// compile time from `K32`.
pub static K32X4: [[u32; 4]; 16] = {
    let mut swapped = [[0u32; 4]; 16];
    let mut row = 0;
    loop {
        if row == 16 {
            break;
        }
        // Index of the first constant belonging to this row.
        let base = 4 * row;
        swapped[row] = [K32[base + 3], K32[base + 2], K32[base + 1], K32[base]];
        row += 1;
    }
    swapped
};

pub const H512_384: State512 = [
0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
];

pub const H512_512: State512 = [
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
];
/// Round constants for the SHA-512 family, grouped two per row with each pair swapped.
///
/// Row `r` is `[K64[2r + 1], K64[2r]]` for every `r` in `0..40`, so all 80 entries of `K64`
/// are covered. The table is built at compile time from `K64`.
pub const K64X2: [[u64; 2]; 40] = {
    let mut res = [[0u64; 2]; 40];
    let mut i = 0;
    // Each row consumes two constants, so 40 rows are needed for the 80-entry table.
    while i < 40 {
        res[i] = [K64[2 * i + 1], K64[2 * i]];
        i += 1;
    }
    res
};
37 changes: 29 additions & 8 deletions sha2/src/sha256/soft.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#![allow(clippy::many_single_char_names)]
use crate::consts::BLOCK_LEN;
use crate::consts::K32;

#[inline(always)]
fn shr(v: [u32; 4], o: u32) -> [u32; 4] {
Expand Down Expand Up @@ -31,6 +31,31 @@ fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
]
}

#[inline(always)]
fn add_round_const(mut a: [u32; 4], i: usize) -> [u32; 4] {
fn k(i: usize, j: usize) -> u32 {
// `read_volatile` forces compiler to read round constants from the static
// instead of inlining them, which improves codegen and performance on some platforms.
// On x86 targets 32-bit constants can be encoded using immediate argument on the `add`
// instruction, so it's more efficient to inline them.
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
use core::ptr::read as r;
} else {
use core::ptr::read_volatile as r;
}
}

unsafe { r(K32.as_ptr().add(4 * i + j)) }
}

a[3] = a[3].wrapping_add(k(i, 0));
a[2] = a[2].wrapping_add(k(i, 1));
a[1] = a[1].wrapping_add(k(i, 2));
a[0] = a[0].wrapping_add(k(i, 3));
a
}

/// Builds a four-lane vector from the last lane of `v3` followed by the first three lanes
/// of `v2`.
fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
    let [first, second, third, _] = v2;
    let [_, _, _, carried] = v3;
    [carried, first, second, third]
}
Expand Down Expand Up @@ -142,7 +167,7 @@ fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4]

macro_rules! rounds4 {
($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
let t1 = add($rest, crate::consts::K32X4[$i]);
let t1 = add_round_const($rest, $i);
$cdgh = sha256_digest_round_x2($cdgh, $abef, t1);
let t2 = sha256swap(t1);
$abef = sha256_digest_round_x2($abef, $cdgh, t2);
Expand Down Expand Up @@ -203,15 +228,11 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) {
}

pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
let mut block_u32 = [0u32; BLOCK_LEN];
// since LLVM can't properly use aliasing yet it will make
// unnecessary state stores without this copy
let mut state_cpy = *state;
for block in blocks {
let mut block_u32 = [0u32; 16];
for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
*o = u32::from_be_bytes(chunk.try_into().unwrap());
}
sha256_digest_block_u32(&mut state_cpy, &block_u32);
sha256_digest_block_u32(state, &block_u32);
}
*state = state_cpy;
}
71 changes: 38 additions & 33 deletions sha2/src/sha512/soft.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
#![allow(clippy::many_single_char_names)]
use crate::consts::{BLOCK_LEN, K64X2};

fn add(a: [u64; 2], b: [u64; 2]) -> [u64; 2] {
[a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])]
}
use crate::consts::K64;

/// Not an intrinsic, but works like an unaligned load.
fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] {
Expand Down Expand Up @@ -93,10 +89,23 @@ pub fn sha512_digest_round(
[a1, e1]
}

/// Adds the two round constants of pair `i` to the lanes of `w` (wrapping) and returns it.
///
/// `w[1]` receives `K64[2 * i]` and `w[0]` receives `K64[2 * i + 1]`.
///
/// # Panics
///
/// Panics if `2 * i + 1` is not a valid index into `K64`.
#[inline(always)]
fn add_rk(mut w: [u64; 2], i: usize) -> [u64; 2] {
    #[inline(always)]
    fn rk(i: usize, j: usize) -> u64 {
        // Borrow the table once so the lookup indexes through a reference instead of a
        // by-value copy of the array.
        let table: &[u64; 80] = &K64;
        // Bounds-checked lookup keeps this safe function sound for any `i`; callers pass
        // literal indices, so the check folds away once everything is inlined.
        let rc: &u64 = &table[2 * i + j];
        // `read_volatile` forces compiler to read round constants from memory
        // instead of inlining them, which improves codegen and performance.
        //
        // SAFETY: `rc` is a reference to an element of `K64`, so it is non-null,
        // aligned and points to an initialized `u64`.
        unsafe { core::ptr::read_volatile(rc) }
    }
    w[1] = w[1].wrapping_add(rk(i, 0));
    w[0] = w[0].wrapping_add(rk(i, 1));
    w
}

/// Process a block with the SHA-512 algorithm.
pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {
let k = &K64X2;

macro_rules! schedule {
($v0:expr, $v1:expr, $v4:expr, $v5:expr, $v7:expr) => {
sha512_schedule_x2($v0, $v1, sha512load($v4, $v5), $v7)
Expand All @@ -122,67 +131,67 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {

// Rounds 0..20
let (mut w1, mut w0) = ([block[3], block[2]], [block[1], block[0]]);
rounds4!(ae, bf, cg, dh, add(k[0], w0), add(k[1], w1));
rounds4!(ae, bf, cg, dh, add_rk(w0, 0), add_rk(w1, 1));
let (mut w3, mut w2) = ([block[7], block[6]], [block[5], block[4]]);
rounds4!(ae, bf, cg, dh, add(k[2], w2), add(k[3], w3));
rounds4!(ae, bf, cg, dh, add_rk(w2, 2), add_rk(w3, 3));
let (mut w5, mut w4) = ([block[11], block[10]], [block[9], block[8]]);
rounds4!(ae, bf, cg, dh, add(k[4], w4), add(k[5], w5));
rounds4!(ae, bf, cg, dh, add_rk(w4, 4), add_rk(w5, 5));
let (mut w7, mut w6) = ([block[15], block[14]], [block[13], block[12]]);
rounds4!(ae, bf, cg, dh, add(k[6], w6), add(k[7], w7));
rounds4!(ae, bf, cg, dh, add_rk(w6, 6), add_rk(w7, 7));
let mut w8 = schedule!(w0, w1, w4, w5, w7);
let mut w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[8], w8), add(k[9], w9));
rounds4!(ae, bf, cg, dh, add_rk(w8, 8), add_rk(w9, 9));

// Rounds 20..40
w0 = schedule!(w2, w3, w6, w7, w9);
w1 = schedule!(w3, w4, w7, w8, w0);
rounds4!(ae, bf, cg, dh, add(k[10], w0), add(k[11], w1));
rounds4!(ae, bf, cg, dh, add_rk(w0, 10), add_rk(w1, 11));
w2 = schedule!(w4, w5, w8, w9, w1);
w3 = schedule!(w5, w6, w9, w0, w2);
rounds4!(ae, bf, cg, dh, add(k[12], w2), add(k[13], w3));
rounds4!(ae, bf, cg, dh, add_rk(w2, 12), add_rk(w3, 13));
w4 = schedule!(w6, w7, w0, w1, w3);
w5 = schedule!(w7, w8, w1, w2, w4);
rounds4!(ae, bf, cg, dh, add(k[14], w4), add(k[15], w5));
rounds4!(ae, bf, cg, dh, add_rk(w4, 14), add_rk(w5, 15));
w6 = schedule!(w8, w9, w2, w3, w5);
w7 = schedule!(w9, w0, w3, w4, w6);
rounds4!(ae, bf, cg, dh, add(k[16], w6), add(k[17], w7));
rounds4!(ae, bf, cg, dh, add_rk(w6, 16), add_rk(w7, 17));
w8 = schedule!(w0, w1, w4, w5, w7);
w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[18], w8), add(k[19], w9));
rounds4!(ae, bf, cg, dh, add_rk(w8, 18), add_rk(w9, 19));

// Rounds 40..60
w0 = schedule!(w2, w3, w6, w7, w9);
w1 = schedule!(w3, w4, w7, w8, w0);
rounds4!(ae, bf, cg, dh, add(k[20], w0), add(k[21], w1));
rounds4!(ae, bf, cg, dh, add_rk(w0, 20), add_rk(w1, 21));
w2 = schedule!(w4, w5, w8, w9, w1);
w3 = schedule!(w5, w6, w9, w0, w2);
rounds4!(ae, bf, cg, dh, add(k[22], w2), add(k[23], w3));
rounds4!(ae, bf, cg, dh, add_rk(w2, 22), add_rk(w3, 23));
w4 = schedule!(w6, w7, w0, w1, w3);
w5 = schedule!(w7, w8, w1, w2, w4);
rounds4!(ae, bf, cg, dh, add(k[24], w4), add(k[25], w5));
rounds4!(ae, bf, cg, dh, add_rk(w4, 24), add_rk(w5, 25));
w6 = schedule!(w8, w9, w2, w3, w5);
w7 = schedule!(w9, w0, w3, w4, w6);
rounds4!(ae, bf, cg, dh, add(k[26], w6), add(k[27], w7));
rounds4!(ae, bf, cg, dh, add_rk(w6, 26), add_rk(w7, 27));
w8 = schedule!(w0, w1, w4, w5, w7);
w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[28], w8), add(k[29], w9));
rounds4!(ae, bf, cg, dh, add_rk(w8, 28), add_rk(w9, 29));

// Rounds 60..80
w0 = schedule!(w2, w3, w6, w7, w9);
w1 = schedule!(w3, w4, w7, w8, w0);
rounds4!(ae, bf, cg, dh, add(k[30], w0), add(k[31], w1));
rounds4!(ae, bf, cg, dh, add_rk(w0, 30), add_rk(w1, 31));
w2 = schedule!(w4, w5, w8, w9, w1);
w3 = schedule!(w5, w6, w9, w0, w2);
rounds4!(ae, bf, cg, dh, add(k[32], w2), add(k[33], w3));
rounds4!(ae, bf, cg, dh, add_rk(w2, 32), add_rk(w3, 33));
w4 = schedule!(w6, w7, w0, w1, w3);
w5 = schedule!(w7, w8, w1, w2, w4);
rounds4!(ae, bf, cg, dh, add(k[34], w4), add(k[35], w5));
rounds4!(ae, bf, cg, dh, add_rk(w4, 34), add_rk(w5, 35));
w6 = schedule!(w8, w9, w2, w3, w5);
w7 = schedule!(w9, w0, w3, w4, w6);
rounds4!(ae, bf, cg, dh, add(k[36], w6), add(k[37], w7));
rounds4!(ae, bf, cg, dh, add_rk(w6, 36), add_rk(w7, 37));
w8 = schedule!(w0, w1, w4, w5, w7);
w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[38], w8), add(k[39], w9));
rounds4!(ae, bf, cg, dh, add_rk(w8, 38), add_rk(w9, 39));

let [a, e] = ae;
let [b, f] = bf;
Expand All @@ -200,15 +209,11 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {
}

pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
let mut block_u32 = [0u64; BLOCK_LEN];
// since LLVM can't properly use aliasing yet it will make
// unnecessary state stores without this copy
let mut state_cpy = *state;
for block in blocks {
let mut block_u32 = [0u64; 16];
for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(8)) {
*o = u64::from_be_bytes(chunk.try_into().unwrap());
}
sha512_digest_block_u64(&mut state_cpy, &block_u32);
sha512_digest_block_u64(state, &block_u32);
}
*state = state_cpy;
}

0 comments on commit 3020704

Please sign in to comment.