From c29bf531bb1cf18143564a006bd44f9c272cde7f Mon Sep 17 00:00:00 2001 From: SChernykh Date: Thu, 28 Sep 2023 10:52:40 +0200 Subject: [PATCH] aarch64 JIT: optimized soft AES 26 fewer instructions per call --- src/jit_compiler_a64_static.S | 149 ++++++++++++++-------------------- 1 file changed, 59 insertions(+), 90 deletions(-) diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S index 62d258e..f066412 100644 --- a/src/jit_compiler_a64_static.S +++ b/src/jit_compiler_a64_static.S @@ -539,83 +539,54 @@ DECL(randomx_program_aarch64_v2_FE_mix_soft_aes): stp x16, x30, [sp, 128] stp q0, q1, [sp, 144] - # f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0) - - mov v1.16b, v20.16b + adr x18, randomx_aes_lut_enc + adr x19, randomx_aes_lut_dec + # f0 = aesenc(f0, e0), f0 = aesenc(f0, e1), f0 = aesenc(f0, e2), f0 = aesenc(f0, e3) mov v0.16b, v16.16b + mov v1.16b, v20.16b bl randomx_soft_aesenc - mov v16.16b, v0.16b - - mov v0.16b, v17.16b - bl randomx_soft_aesdec - mov v17.16b, v0.16b - - mov v0.16b, v18.16b - bl randomx_soft_aesenc - mov v18.16b, v0.16b - - mov v0.16b, v19.16b - bl randomx_soft_aesdec - mov v19.16b, v0.16b - - # f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1) - mov v1.16b, v21.16b - - mov v0.16b, v16.16b - bl randomx_soft_aesenc - mov v16.16b, v0.16b - - mov v0.16b, v17.16b - bl randomx_soft_aesdec - mov v17.16b, v0.16b - - mov v0.16b, v18.16b bl randomx_soft_aesenc - mov v18.16b, v0.16b - - mov v0.16b, v19.16b - bl randomx_soft_aesdec - mov v19.16b, v0.16b - - # f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2) - mov v1.16b, v22.16b - - mov v0.16b, v16.16b + bl randomx_soft_aesenc + mov v1.16b, v23.16b bl randomx_soft_aesenc mov v16.16b, v0.16b + # f1 = aesdec(f1, e0), f1 = aesdec(f1, e1), f1 = aesdec(f1, e2), f1 = aesdec(f1, e3) mov v0.16b, v17.16b + mov v1.16b, v20.16b bl randomx_soft_aesdec - mov v17.16b, v0.16b - - mov v0.16b, v18.16b - bl randomx_soft_aesenc - mov v18.16b, v0.16b - - mov v0.16b, v19.16b + mov v1.16b, v21.16b + bl randomx_soft_aesdec + mov v1.16b, v22.16b bl randomx_soft_aesdec - mov v19.16b, v0.16b - - # f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3) - mov v1.16b, v23.16b - - mov v0.16b, v16.16b - bl randomx_soft_aesenc - mov v16.16b, v0.16b - - mov v0.16b, v17.16b bl randomx_soft_aesdec mov v17.16b, v0.16b + # f2 = aesenc(f2, e0), f2 = aesenc(f2, e1), f2 = aesenc(f2, e2), f2 = aesenc(f2, e3) mov v0.16b, v18.16b + mov v1.16b, v20.16b + bl randomx_soft_aesenc + mov v1.16b, v21.16b + bl randomx_soft_aesenc + mov v1.16b, v22.16b + bl randomx_soft_aesenc + mov v1.16b, v23.16b bl randomx_soft_aesenc mov v18.16b, v0.16b + # f3 = aesdec(f3, e0), f3 = aesdec(f3, e1), f3 = aesdec(f3, e2), f3 = aesdec(f3, e3) mov v0.16b, v19.16b + mov v1.16b, v20.16b + bl randomx_soft_aesdec + mov v1.16b, v21.16b + bl randomx_soft_aesdec + mov v1.16b, v22.16b + bl randomx_soft_aesdec + mov v1.16b, v23.16b bl randomx_soft_aesdec mov v19.16b, v0.16b @@ -644,7 +615,6 @@ randomx_soft_aesenc: umov w11, v0.b[3] umov w5, v0.b[0] umov w16, v0.b[4] - adr x0, randomx_aes_lut_enc add x4, x4, 256 add x1, x1, 512 add x12, x12, 768 @@ -654,20 +624,20 @@ randomx_soft_aesenc: add x9, x9, 256 add x2, x2, 512 add x11, x11, 768 - ldr w10, [x0, x4, lsl 2] - ldr w15, [x0, x5, lsl 2] + ldr w10, [x18, x4, lsl 2] + ldr w15, [x18, x5, lsl 2] umov w13, v0.b[8] - ldr w14, [x0, x12, lsl 2] + ldr w14, [x18, x12, lsl 2] umov w6, v0.b[1] - ldr w1, [x0, x1, lsl 2] + ldr w1, [x18, x1, lsl 2] eor w10, w10, w15 - ldr w2, [x0, x2, lsl 2] + ldr w2, [x18, x2, lsl 2] umov w5, v0.b[6] - ldr w9, [x0, x9, lsl 2] + ldr w9, [x18, x9, lsl 2] umov w4, v0.b[11] - ldr w12, [x0, x16, lsl 2] + ldr w12, [x18, x16, lsl 2] eor w1, w1, w14 - ldr w11, [x0, x11, lsl 2] + ldr w11, [x18, x11, lsl 2] eor w1, w1, w10 add x8, x8, 512 add x3, x3, 256 @@ -677,21 +647,21 @@ randomx_soft_aesenc: eor w1, w2, w11 umov w10, v0.b[12] eor w1, w1, w9 - ldr w3, [x0, x3, lsl 2] + ldr w3, [x18, x3, lsl 2] add x6, x6, 256 - ldr w9, [x0, x13, lsl 2] + ldr w9, [x18, x13, lsl 2] ins v28.s[1], w1 - ldr w2, [x0, x8, lsl 2] + ldr w2, [x18, x8, lsl 2] add x5, x5, 512 - ldr w7, [x0, x7, lsl 2] + ldr w7, [x18, x7, lsl 2] add x4, x4, 768 eor w1, w3, w9 - ldr w3, [x0, x6, lsl 2] + ldr w3, [x18, x6, lsl 2] eor w2, w2, w7 - ldr w6, [x0, x10, lsl 2] + ldr w6, [x18, x10, lsl 2] eor w2, w2, w1 - ldr w1, [x0, x5, lsl 2] - ldr w0, [x0, x4, lsl 2] + ldr w1, [x18, x5, lsl 2] + ldr w0, [x18, x4, lsl 2] eor w3, w3, w6 ins v28.s[2], w2 eor w0, w1, w0 @@ -709,7 +679,6 @@ randomx_soft_aesdec: umov w11, v0.b[1] umov w4, v0.b[0] umov w16, v0.b[4] - adr x0, randomx_aes_lut_dec add x3, x3, 768 add x1, x1, 512 add x12, x12, 256 @@ -719,20 +688,20 @@ randomx_soft_aesdec: add x9, x9, 768 add x2, x2, 512 add x11, x11, 256 - ldr w15, [x0, x3, lsl 2] - ldr w10, [x0, x4, lsl 2] + ldr w15, [x19, x3, lsl 2] + ldr w10, [x19, x4, lsl 2] umov w13, v0.b[8] - ldr w14, [x0, x12, lsl 2] + ldr w14, [x19, x12, lsl 2] umov w5, v0.b[9] - ldr w1, [x0, x1, lsl 2] + ldr w1, [x19, x1, lsl 2] umov w3, v0.b[6] - ldr w12, [x0, x9, lsl 2] + ldr w12, [x19, x9, lsl 2] umov w4, v0.b[3] - ldr w9, [x0, x16, lsl 2] + ldr w9, [x19, x16, lsl 2] eor w1, w1, w15 - ldr w2, [x0, x2, lsl 2] + ldr w2, [x19, x2, lsl 2] eor w10, w10, w14 - ldr w11, [x0, x11, lsl 2] + ldr w11, [x19, x11, lsl 2] eor w1, w1, w10 add x8, x8, 256 add x6, x6, 512 @@ -742,21 +711,21 @@ randomx_soft_aesdec: eor w1, w9, w11 eor w1, w2, w1 umov w9, v0.b[12] - ldr w2, [x0, x13, lsl 2] + ldr w2, [x19, x13, lsl 2] add x5, x5, 256 - ldr w8, [x0, x8, lsl 2] + ldr w8, [x19, x8, lsl 2] ins v28.s[1], w1 - ldr w6, [x0, x6, lsl 2] + ldr w6, [x19, x6, lsl 2] add x3, x3, 512 - ldr w7, [x0, x7, lsl 2] + ldr w7, [x19, x7, lsl 2] add x4, x4, 768 eor w2, w2, w8 - ldr w1, [x0, x9, lsl 2] + ldr w1, [x19, x9, lsl 2] eor w6, w6, w7 - ldr w3, [x0, x3, lsl 2] + ldr w3, [x19, x3, lsl 2] eor w2, w2, w6 - ldr w4, [x0, x4, lsl 2] - ldr w5, [x0, x5, lsl 2] + ldr w4, [x19, x4, lsl 2] + ldr w5, [x19, x5, lsl 2] ins v28.s[2], w2 eor w0, w1, w5 eor w1, w3, w4