ServiceNow · jlamypoirier · Apr 23, 2026 · Apr 23, 2026
diff --git a/fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py b/fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py
@@ -144,7 +144,7 @@ def kda_mixer_config(kda_config):
         "heads": num_heads,
         "head_dim": head_dim,
         "convolution_layer": {"kernel_size": 4},
-        "normalization": {"epsilon": 1e-5},
+        "normalization": {"epsilon": 1e-5, "activation": "sigmoid"},
     }
 
 
@@ -1088,9 +1088,8 @@ def test_vs_fla(
         fla_cache = FLACache()
         apriel_cache = Apriel2Cache(make_apriel2_config(kda_hidden_size, kda_mixer_config))
 
-        # Force chunk mode for prefill
-        fla_kda.mode = "chunk"
-        apriel_kda.mode = "chunk"
+        # Mirror Apriel2's own mode selection (fused_recurrent when seq_len <= 64 in eval mode); only FLA's mode is set by hand
+        fla_kda.mode = "fused_recurrent"
 
         # ========== PHASE 1: Initial Prefill ==========
         prefill_input = hidden_states[:, :prefill_len, :]
@@ -1125,7 +1124,6 @@ def test_vs_fla(
 
         # ========== PHASE 2: Decode (single tokens) ==========
         fla_kda.mode = "fused_recurrent"
-        apriel_kda.mode = "fused_recurrent"
 
         for i in range(decode_steps):
             pos = prefill_len + i
@@ -1160,9 +1158,7 @@ def test_vs_fla(
         )
 
         # ========== PHASE 3: Prefill again (decode→prefill transition) ==========
-        # FLA KDA correctly uses initial_state in chunk mode, so this should match
-        fla_kda.mode = "chunk"
-        apriel_kda.mode = "chunk"
+        fla_kda.mode = "fused_recurrent"
 
         prefill2_start = prefill_len + decode_steps
         prefill2_input = hidden_states[:, prefill2_start : prefill2_start + prefill2_len, :]

diff --git a/setup.cfg b/setup.cfg
@@ -44,6 +44,7 @@ OPTIONAL =
 # Huggingface tools
 HUGGINGFACE =
     transformers>=4.57.3,<5.0.0
+    accelerate>=1.4.0
     hf-transfer>=0.1.9
     datasets>=4.4.1
     huggingface-hub>=0.36.0