diff --git a/keccak/Cargo.toml b/keccak/Cargo.toml
index 570db27..9cd542b 100644
--- a/keccak/Cargo.toml
+++ b/keccak/Cargo.toml
@@ -14,5 +14,9 @@ categories = ["cryptography", "no-std"]
 readme = "README.md"
 
 [features]
+asm = [] # Use optimized assembly when available (currently only ARMv8)
 no_unroll = [] # Do not unroll loops for binary size reduction
-simd = [] # Use core::simd (WARNING: requires Nigthly)
+simd = [] # Use core::simd (WARNING: requires Nightly)
+
+[target.'cfg(target_arch = "aarch64")'.dependencies]
+cpufeatures = "0.2"
diff --git a/keccak/src/aarch64_sha3.rs b/keccak/src/aarch64_sha3.rs
index 93a640d..88e756c 100644
--- a/keccak/src/aarch64_sha3.rs
+++ b/keccak/src/aarch64_sha3.rs
@@ -1,130 +1,127 @@
-#![cfg(all(target_arch = "aarch64", target_feature = "sha3"))]
-
 /// Keccak-f1600 on ARMv8.4-A with FEAT_SHA3.
 ///
 /// See section K12.2.2, p. 11,749 of the ARM Reference Manual.
 /// Adapted from the Keccak-f1600 implementation in the XKCP/K12.
 /// see
-pub fn keccak_f1600(state: &mut [u64; 25]) {
-    unsafe {
-        core::arch::asm!("
-            // Read state
-            ld1.1d {{ v0- v3}}, [x0], #32
-            ld1.1d {{ v4- v7}}, [x0], #32
-            ld1.1d {{ v8-v11}}, [x0], #32
-            ld1.1d {{v12-v15}}, [x0], #32
-            ld1.1d {{v16-v19}}, [x0], #32
-            ld1.1d {{v20-v23}}, [x0], #32
-            ld1.1d {{v24}}, [x0]
-            sub x0, x0, #192
+#[target_feature(enable = "sha3")]
+pub unsafe fn f1600_asm(state: &mut [u64; 25]) {
+    core::arch::asm!("
+        // Read state
+        ld1.1d {{ v0- v3}}, [x0], #32
+        ld1.1d {{ v4- v7}}, [x0], #32
+        ld1.1d {{ v8-v11}}, [x0], #32
+        ld1.1d {{v12-v15}}, [x0], #32
+        ld1.1d {{v16-v19}}, [x0], #32
+        ld1.1d {{v20-v23}}, [x0], #32
+        ld1.1d {{v24}}, [x0]
+        sub x0, x0, #192
 
-            // Loop 24 rounds
-            // NOTE: This loop actually computes two f1600 functions in
-            // parallel, in both the lower and the upper 64-bit of the
-            // 128-bit registers v0-v24.
-            mov x8, #24
-            0: sub x8, x8, #1
+        // Loop 24 rounds
+        // NOTE: This loop actually computes two f1600 functions in
+        // parallel, in both the lower and the upper 64-bit of the
+        // 128-bit registers v0-v24.
+        mov x8, #24
+        0: sub x8, x8, #1
 
-            // Theta Calculations
-            eor3.16b v25, v20, v15, v10
-            eor3.16b v26, v21, v16, v11
-            eor3.16b v27, v22, v17, v12
-            eor3.16b v28, v23, v18, v13
-            eor3.16b v29, v24, v19, v14
-            eor3.16b v25, v25, v5, v0
-            eor3.16b v26, v26, v6, v1
-            eor3.16b v27, v27, v7, v2
-            eor3.16b v28, v28, v8, v3
-            eor3.16b v29, v29, v9, v4
-            rax1.2d v30, v25, v27
-            rax1.2d v31, v26, v28
-            rax1.2d v27, v27, v29
-            rax1.2d v28, v28, v25
-            rax1.2d v29, v29, v26
+        // Theta Calculations
+        eor3.16b v25, v20, v15, v10
+        eor3.16b v26, v21, v16, v11
+        eor3.16b v27, v22, v17, v12
+        eor3.16b v28, v23, v18, v13
+        eor3.16b v29, v24, v19, v14
+        eor3.16b v25, v25, v5, v0
+        eor3.16b v26, v26, v6, v1
+        eor3.16b v27, v27, v7, v2
+        eor3.16b v28, v28, v8, v3
+        eor3.16b v29, v29, v9, v4
+        rax1.2d v30, v25, v27
+        rax1.2d v31, v26, v28
+        rax1.2d v27, v27, v29
+        rax1.2d v28, v28, v25
+        rax1.2d v29, v29, v26
 
-            // Rho and Phi
-            eor.16b v0, v0, v29
-            xar.2d v25, v1, v30, #64 - 1
-            xar.2d v1, v6, v30, #64 - 44
-            xar.2d v6, v9, v28, #64 - 20
-            xar.2d v9, v22, v31, #64 - 61
-            xar.2d v22, v14, v28, #64 - 39
-            xar.2d v14, v20, v29, #64 - 18
-            xar.2d v26, v2, v31, #64 - 62
-            xar.2d v2, v12, v31, #64 - 43
-            xar.2d v12, v13, v27, #64 - 25
-            xar.2d v13, v19, v28, #64 - 8
-            xar.2d v19, v23, v27, #64 - 56
-            xar.2d v23, v15, v29, #64 - 41
-            xar.2d v15, v4, v28, #64 - 27
-            xar.2d v28, v24, v28, #64 - 14
-            xar.2d v24, v21, v30, #64 - 2
-            xar.2d v8, v8, v27, #64 - 55
-            xar.2d v4, v16, v30, #64 - 45
-            xar.2d v16, v5, v29, #64 - 36
-            xar.2d v5, v3, v27, #64 - 28
-            xar.2d v27, v18, v27, #64 - 21
-            xar.2d v3, v17, v31, #64 - 15
-            xar.2d v30, v11, v30, #64 - 10
-            xar.2d v31, v7, v31, #64 - 6
-            xar.2d v29, v10, v29, #64 - 3
+        // Rho and Phi
+        eor.16b v0, v0, v29
+        xar.2d v25, v1, v30, #64 - 1
+        xar.2d v1, v6, v30, #64 - 44
+        xar.2d v6, v9, v28, #64 - 20
+        xar.2d v9, v22, v31, #64 - 61
+        xar.2d v22, v14, v28, #64 - 39
+        xar.2d v14, v20, v29, #64 - 18
+        xar.2d v26, v2, v31, #64 - 62
+        xar.2d v2, v12, v31, #64 - 43
+        xar.2d v12, v13, v27, #64 - 25
+        xar.2d v13, v19, v28, #64 - 8
+        xar.2d v19, v23, v27, #64 - 56
+        xar.2d v23, v15, v29, #64 - 41
+        xar.2d v15, v4, v28, #64 - 27
+        xar.2d v28, v24, v28, #64 - 14
+        xar.2d v24, v21, v30, #64 - 2
+        xar.2d v8, v8, v27, #64 - 55
+        xar.2d v4, v16, v30, #64 - 45
+        xar.2d v16, v5, v29, #64 - 36
+        xar.2d v5, v3, v27, #64 - 28
+        xar.2d v27, v18, v27, #64 - 21
+        xar.2d v3, v17, v31, #64 - 15
+        xar.2d v30, v11, v30, #64 - 10
+        xar.2d v31, v7, v31, #64 - 6
+        xar.2d v29, v10, v29, #64 - 3
 
-            // Chi and Iota
-            bcax.16b v20, v26, v22, v8
-            bcax.16b v21, v8, v23, v22
-            bcax.16b v22, v22, v24, v23
-            bcax.16b v23, v23, v26, v24
-            bcax.16b v24, v24, v8, v26
+        // Chi and Iota
+        bcax.16b v20, v26, v22, v8
+        bcax.16b v21, v8, v23, v22
+        bcax.16b v22, v22, v24, v23
+        bcax.16b v23, v23, v26, v24
+        bcax.16b v24, v24, v8, v26
 
-            ld1r.2d {{v26}}, [x1], #8
+        ld1r.2d {{v26}}, [x1], #8
 
-            bcax.16b v17, v30, v19, v3
-            bcax.16b v18, v3, v15, v19
-            bcax.16b v19, v19, v16, v15
-            bcax.16b v15, v15, v30, v16
-            bcax.16b v16, v16, v3, v30
+        bcax.16b v17, v30, v19, v3
+        bcax.16b v18, v3, v15, v19
+        bcax.16b v19, v19, v16, v15
+        bcax.16b v15, v15, v30, v16
+        bcax.16b v16, v16, v3, v30
 
-            bcax.16b v10, v25, v12, v31
-            bcax.16b v11, v31, v13, v12
-            bcax.16b v12, v12, v14, v13
-            bcax.16b v13, v13, v25, v14
-            bcax.16b v14, v14, v31, v25
+        bcax.16b v10, v25, v12, v31
+        bcax.16b v11, v31, v13, v12
+        bcax.16b v12, v12, v14, v13
+        bcax.16b v13, v13, v25, v14
+        bcax.16b v14, v14, v31, v25
 
-            bcax.16b v7, v29, v9, v4
-            bcax.16b v8, v4, v5, v9
-            bcax.16b v9, v9, v6, v5
-            bcax.16b v5, v5, v29, v6
-            bcax.16b v6, v6, v4, v29
+        bcax.16b v7, v29, v9, v4
+        bcax.16b v8, v4, v5, v9
+        bcax.16b v9, v9, v6, v5
+        bcax.16b v5, v5, v29, v6
+        bcax.16b v6, v6, v4, v29
 
-            bcax.16b v3, v27, v0, v28
-            bcax.16b v4, v28, v1, v0
-            bcax.16b v0, v0, v2, v1
-            bcax.16b v1, v1, v27, v2
-            bcax.16b v2, v2, v28, v27
+        bcax.16b v3, v27, v0, v28
+        bcax.16b v4, v28, v1, v0
+        bcax.16b v0, v0, v2, v1
+        bcax.16b v1, v1, v27, v2
+        bcax.16b v2, v2, v28, v27
 
-            eor.16b v0,v0,v26
+        eor.16b v0,v0,v26
 
-            // Rounds loop
-            cbnz w8, 0b
+        // Rounds loop
+        cbnz w8, 0b
 
-            // Write state
-            st1.1d {{ v0- v3}}, [x0], #32
-            st1.1d {{ v4- v7}}, [x0], #32
-            st1.1d {{ v8-v11}}, [x0], #32
-            st1.1d {{v12-v15}}, [x0], #32
-            st1.1d {{v16-v19}}, [x0], #32
-            st1.1d {{v20-v23}}, [x0], #32
-            st1.1d {{v24}}, [x0]
-            ",
-            in("x0") state.as_mut_ptr(),
-            in("x1") crate::RC.as_ptr(),
-            clobber_abi("C"),
-            options(nostack)
-        );
-    }
+        // Write state
+        st1.1d {{ v0- v3}}, [x0], #32
+        st1.1d {{ v4- v7}}, [x0], #32
+        st1.1d {{ v8-v11}}, [x0], #32
+        st1.1d {{v12-v15}}, [x0], #32
+        st1.1d {{v16-v19}}, [x0], #32
+        st1.1d {{v20-v23}}, [x0], #32
+        st1.1d {{v24}}, [x0]
+        ",
+        in("x0") state.as_mut_ptr(),
+        in("x1") crate::RC.as_ptr(),
+        clobber_abi("C"),
+        options(nostack)
+    );
 }
 
-#[cfg(test)]
+#[cfg(all(test, target_feature = "sha3"))]
 mod tests {
     use super::*;
 
@@ -188,9 +185,9 @@ mod tests {
         ];
 
         let mut state = [0u64; 25];
-        keccak_f1600(&mut state);
+        unsafe { f1600_asm(&mut state) };
         assert_eq!(state, state_first);
-        keccak_f1600(&mut state);
+        unsafe { f1600_asm(&mut state) };
         assert_eq!(state, state_second);
     }
 }
diff --git a/keccak/src/lib.rs b/keccak/src/lib.rs
index 9ce5559..b5de246 100644
--- a/keccak/src/lib.rs
+++ b/keccak/src/lib.rs
@@ -48,8 +48,16 @@ use core::{
 #[rustfmt::skip]
 mod unroll;
+
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
 mod aarch64_sha3;
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
+pub use aarch64_sha3::f1600_asm;
+
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
+cpufeatures::new!(armv8_sha3_intrinsics, "sha3");
+
 const PLEN: usize = 25;
 
 const RHO: [u32; 24] = [
@@ -145,11 +153,17 @@ impl_keccak!(f200, u8);
 impl_keccak!(f400, u16);
 impl_keccak!(f800, u32);
 
-#[cfg(not(all(target_arch = "aarch64", target_feature = "sha3")))]
+#[cfg(not(all(target_arch = "aarch64", feature = "asm")))]
 impl_keccak!(f1600, u64);
 
-#[cfg(all(target_arch = "aarch64", target_feature = "sha3"))]
-pub use aarch64_sha3::keccak_f1600 as f1600;
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
+pub fn f1600(state: &mut [u64; PLEN]) {
+    if armv8_sha3_intrinsics::get() {
+        unsafe { f1600_asm(state) }
+    } else {
+        keccak_p(state, u64::KECCAK_F_ROUND_COUNT);
+    }
+}
 
 #[cfg(feature = "simd")]
 /// SIMD implementations for Keccak-f1600 sponge function
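
Usage note (not part of the patch): with the `asm` feature enabled, runtime dispatch happens inside `keccak::f1600`, so downstream callers need no cfg gates or unsafe of their own. A minimal sketch, assuming a dependent crate with `keccak = { version = "*", features = ["asm"] }` in its Cargo.toml:

    fn main() {
        let mut state = [0u64; 25];
        // On aarch64 builds with the `asm` feature, f1600 probes FEAT_SHA3
        // at runtime via cpufeatures and falls back to the portable
        // keccak_p permutation when the extension is absent.
        keccak::f1600(&mut state);
        // Permuting the all-zero state yields a fixed, nonzero vector.
        assert_ne!(state, [0u64; 25]);
    }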
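
Calling the re-exported `f1600_asm` directly is also possible, but it is a `#[target_feature(enable = "sha3")]` function, so the caller takes on the detection obligation. A hedged sketch; `sha3_detect` and `f1600_checked` are illustrative names, the keccak crate is assumed to be built with its `asm` feature, and the calling crate is assumed to depend on `cpufeatures` itself:

    #[cfg(target_arch = "aarch64")]
    mod direct {
        // Generates a module with a cached runtime check for FEAT_SHA3.
        cpufeatures::new!(sha3_detect, "sha3");

        /// Runs the assembly backend when FEAT_SHA3 is present; returns
        /// false (leaving `state` untouched) otherwise.
        pub fn f1600_checked(state: &mut [u64; 25]) -> bool {
            if sha3_detect::get() {
                // SAFETY: the "sha3" target feature was just verified at
                // runtime, satisfying f1600_asm's #[target_feature] contract.
                unsafe { keccak::f1600_asm(state) };
                true
            } else {
                false
            }
        }
    }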