Skip to content

Commit

Permalink
aarch64 JIT: optimized soft AES
Browse files (browse the repository at this point in the history)
26 fewer instructions per call
  • Loading branch information
SChernykh committed Sep 28, 2023
1 parent a99d803 commit c29bf53
Showing 1 changed file with 59 additions and 90 deletions.
149 changes: 59 additions & 90 deletions src/jit_compiler_a64_static.S
Original file line number Diff line number Diff line change
Expand Up @@ -539,83 +539,54 @@ DECL(randomx_program_aarch64_v2_FE_mix_soft_aes):
stp x16, x30, [sp, 128]
stp q0, q1, [sp, 144]

# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)

mov v1.16b, v20.16b
adr x18, randomx_aes_lut_enc
adr x19, randomx_aes_lut_dec

# f0 = aesenc(f0, e0), f0 = aesenc(f0, e1), f0 = aesenc(f0, e2), f0 = aesenc(f0, e3)
mov v0.16b, v16.16b
mov v1.16b, v20.16b
bl randomx_soft_aesenc
mov v16.16b, v0.16b

mov v0.16b, v17.16b
bl randomx_soft_aesdec
mov v17.16b, v0.16b

mov v0.16b, v18.16b
bl randomx_soft_aesenc
mov v18.16b, v0.16b

mov v0.16b, v19.16b
bl randomx_soft_aesdec
mov v19.16b, v0.16b

# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)

mov v1.16b, v21.16b

mov v0.16b, v16.16b
bl randomx_soft_aesenc
mov v16.16b, v0.16b

mov v0.16b, v17.16b
bl randomx_soft_aesdec
mov v17.16b, v0.16b

mov v0.16b, v18.16b
bl randomx_soft_aesenc
mov v18.16b, v0.16b

mov v0.16b, v19.16b
bl randomx_soft_aesdec
mov v19.16b, v0.16b

# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)

mov v1.16b, v22.16b

mov v0.16b, v16.16b
bl randomx_soft_aesenc
mov v1.16b, v23.16b
bl randomx_soft_aesenc
mov v16.16b, v0.16b

# f1 = aesdec(f1, e0), f1 = aesdec(f1, e1), f1 = aesdec(f1, e2), f1 = aesdec(f1, e3)
mov v0.16b, v17.16b
mov v1.16b, v20.16b
bl randomx_soft_aesdec
mov v17.16b, v0.16b

mov v0.16b, v18.16b
bl randomx_soft_aesenc
mov v18.16b, v0.16b

mov v0.16b, v19.16b
mov v1.16b, v21.16b
bl randomx_soft_aesdec
mov v1.16b, v22.16b
bl randomx_soft_aesdec
mov v19.16b, v0.16b

# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)

mov v1.16b, v23.16b

mov v0.16b, v16.16b
bl randomx_soft_aesenc
mov v16.16b, v0.16b

mov v0.16b, v17.16b
bl randomx_soft_aesdec
mov v17.16b, v0.16b

# f2 = aesenc(f2, e0), f2 = aesenc(f2, e1), f2 = aesenc(f2, e2), f2 = aesenc(f2, e3)
mov v0.16b, v18.16b
mov v1.16b, v20.16b
bl randomx_soft_aesenc
mov v1.16b, v21.16b
bl randomx_soft_aesenc
mov v1.16b, v22.16b
bl randomx_soft_aesenc
mov v1.16b, v23.16b
bl randomx_soft_aesenc
mov v18.16b, v0.16b

# f3 = aesdec(f3, e0), f3 = aesdec(f3, e1), f3 = aesdec(f3, e2), f3 = aesdec(f3, e3)
mov v0.16b, v19.16b
mov v1.16b, v20.16b
bl randomx_soft_aesdec
mov v1.16b, v21.16b
bl randomx_soft_aesdec
mov v1.16b, v22.16b
bl randomx_soft_aesdec
mov v1.16b, v23.16b
bl randomx_soft_aesdec
mov v19.16b, v0.16b

Expand Down Expand Up @@ -644,7 +615,6 @@ randomx_soft_aesenc:
umov w11, v0.b[3]
umov w5, v0.b[0]
umov w16, v0.b[4]
adr x0, randomx_aes_lut_enc
add x4, x4, 256
add x1, x1, 512
add x12, x12, 768
Expand All @@ -654,20 +624,20 @@ randomx_soft_aesenc:
add x9, x9, 256
add x2, x2, 512
add x11, x11, 768
ldr w10, [x0, x4, lsl 2]
ldr w15, [x0, x5, lsl 2]
ldr w10, [x18, x4, lsl 2]
ldr w15, [x18, x5, lsl 2]
umov w13, v0.b[8]
ldr w14, [x0, x12, lsl 2]
ldr w14, [x18, x12, lsl 2]
umov w6, v0.b[1]
ldr w1, [x0, x1, lsl 2]
ldr w1, [x18, x1, lsl 2]
eor w10, w10, w15
ldr w2, [x0, x2, lsl 2]
ldr w2, [x18, x2, lsl 2]
umov w5, v0.b[6]
ldr w9, [x0, x9, lsl 2]
ldr w9, [x18, x9, lsl 2]
umov w4, v0.b[11]
ldr w12, [x0, x16, lsl 2]
ldr w12, [x18, x16, lsl 2]
eor w1, w1, w14
ldr w11, [x0, x11, lsl 2]
ldr w11, [x18, x11, lsl 2]
eor w1, w1, w10
add x8, x8, 512
add x3, x3, 256
Expand All @@ -677,21 +647,21 @@ randomx_soft_aesenc:
eor w1, w2, w11
umov w10, v0.b[12]
eor w1, w1, w9
ldr w3, [x0, x3, lsl 2]
ldr w3, [x18, x3, lsl 2]
add x6, x6, 256
ldr w9, [x0, x13, lsl 2]
ldr w9, [x18, x13, lsl 2]
ins v28.s[1], w1
ldr w2, [x0, x8, lsl 2]
ldr w2, [x18, x8, lsl 2]
add x5, x5, 512
ldr w7, [x0, x7, lsl 2]
ldr w7, [x18, x7, lsl 2]
add x4, x4, 768
eor w1, w3, w9
ldr w3, [x0, x6, lsl 2]
ldr w3, [x18, x6, lsl 2]
eor w2, w2, w7
ldr w6, [x0, x10, lsl 2]
ldr w6, [x18, x10, lsl 2]
eor w2, w2, w1
ldr w1, [x0, x5, lsl 2]
ldr w0, [x0, x4, lsl 2]
ldr w1, [x18, x5, lsl 2]
ldr w0, [x18, x4, lsl 2]
eor w3, w3, w6
ins v28.s[2], w2
eor w0, w1, w0
Expand All @@ -709,7 +679,6 @@ randomx_soft_aesdec:
umov w11, v0.b[1]
umov w4, v0.b[0]
umov w16, v0.b[4]
adr x0, randomx_aes_lut_dec
add x3, x3, 768
add x1, x1, 512
add x12, x12, 256
Expand All @@ -719,20 +688,20 @@ randomx_soft_aesdec:
add x9, x9, 768
add x2, x2, 512
add x11, x11, 256
ldr w15, [x0, x3, lsl 2]
ldr w10, [x0, x4, lsl 2]
ldr w15, [x19, x3, lsl 2]
ldr w10, [x19, x4, lsl 2]
umov w13, v0.b[8]
ldr w14, [x0, x12, lsl 2]
ldr w14, [x19, x12, lsl 2]
umov w5, v0.b[9]
ldr w1, [x0, x1, lsl 2]
ldr w1, [x19, x1, lsl 2]
umov w3, v0.b[6]
ldr w12, [x0, x9, lsl 2]
ldr w12, [x19, x9, lsl 2]
umov w4, v0.b[3]
ldr w9, [x0, x16, lsl 2]
ldr w9, [x19, x16, lsl 2]
eor w1, w1, w15
ldr w2, [x0, x2, lsl 2]
ldr w2, [x19, x2, lsl 2]
eor w10, w10, w14
ldr w11, [x0, x11, lsl 2]
ldr w11, [x19, x11, lsl 2]
eor w1, w1, w10
add x8, x8, 256
add x6, x6, 512
Expand All @@ -742,21 +711,21 @@ randomx_soft_aesdec:
eor w1, w9, w11
eor w1, w2, w1
umov w9, v0.b[12]
ldr w2, [x0, x13, lsl 2]
ldr w2, [x19, x13, lsl 2]
add x5, x5, 256
ldr w8, [x0, x8, lsl 2]
ldr w8, [x19, x8, lsl 2]
ins v28.s[1], w1
ldr w6, [x0, x6, lsl 2]
ldr w6, [x19, x6, lsl 2]
add x3, x3, 512
ldr w7, [x0, x7, lsl 2]
ldr w7, [x19, x7, lsl 2]
add x4, x4, 768
eor w2, w2, w8
ldr w1, [x0, x9, lsl 2]
ldr w1, [x19, x9, lsl 2]
eor w6, w6, w7
ldr w3, [x0, x3, lsl 2]
ldr w3, [x19, x3, lsl 2]
eor w2, w2, w6
ldr w4, [x0, x4, lsl 2]
ldr w5, [x0, x5, lsl 2]
ldr w4, [x19, x4, lsl 2]
ldr w5, [x19, x5, lsl 2]
ins v28.s[2], w2
eor w0, w1, w5
eor w1, w3, w4
Expand Down

0 comments on commit c29bf53

Please sign in to comment.