feat(HookedTransformer) accelerate inference with flash attention
StarConnor committed Jul 3, 2024
1 parent d844b51 commit 0e3d268
Showing 11 changed files with 289 additions and 34 deletions.
125 changes: 125 additions & 0 deletions TransformerLens/tests/integration/test_flash_attn.py
@@ -0,0 +1,125 @@
import einops
import torch

from transformer_lens.components import Attention, GroupedQueryAttention
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig


def test_flash_attention_output_is_correct():
"""
Verify if flash attention output is correct.
"""
d_model = 512
d_head = 32
n_heads = 16
n_ctx = 128
n_key_value_heads = 4
n_layers = 1
dtype = torch.bfloat16
device = torch.device('cuda')

cfg_dict = {
'use_flash_attn': False,
'd_model': d_model,
'd_head': d_head,
'n_heads': n_heads,
'n_ctx': n_ctx,
'n_key_value_heads': n_key_value_heads,
'n_layers': n_layers,
'act_fn': "silu",
'dtype': torch.bfloat16,
}
regular_attention_cfg = HookedTransformerConfig.from_dict(cfg_dict)
cfg_dict['use_flash_attn'] = True
flash_attention_cfg = HookedTransformerConfig.from_dict(cfg_dict)
flash_gqa_attention_cfg = HookedTransformerConfig.from_dict(cfg_dict)

regular_attention = Attention(regular_attention_cfg)

assert not hasattr(regular_attention, 'flash_attn_func'), "AbstractAttention should not have 'flash_attn_func' when `use_flash_attn=False`"

flash_attention = Attention(flash_attention_cfg)

assert hasattr(flash_attention, 'flash_attn_func'), "AbstractAttention should have 'flash_attn_func' when `use_flash_attn=True`"

flash_gqa_attention = GroupedQueryAttention(flash_gqa_attention_cfg)

# Variables starting with `_` are the grouped-query (GQA) key/value parameters before they are repeated across query heads
W_Q = torch.rand((n_heads, d_model, d_head), dtype=dtype)
b_Q = torch.rand((n_heads, d_head), dtype=dtype)
_W_K = torch.rand((n_key_value_heads, d_model, d_head), dtype=dtype)
W_K = torch.repeat_interleave(_W_K, dim=0, repeats=n_heads // n_key_value_heads)
_b_K = torch.rand((n_key_value_heads, d_head), dtype=dtype)
b_K = torch.repeat_interleave(_b_K, dim=0, repeats=n_heads // n_key_value_heads)
_W_V = torch.rand((n_key_value_heads, d_model, d_head), dtype=dtype)
W_V = torch.repeat_interleave(_W_V, dim=0, repeats=n_heads // n_key_value_heads)
_b_V = torch.rand((n_key_value_heads, d_head), dtype=dtype)
b_V = torch.repeat_interleave(_b_V, dim=0, repeats=n_heads // n_key_value_heads)
W_O = torch.rand((n_heads, d_head, d_model), dtype=dtype)
b_O = torch.rand(d_model, dtype=dtype)

regular_attention_state_dict = {
"W_Q": W_Q,
"b_Q": b_Q,
"W_O": W_O,
"b_O": b_O,
"W_K": W_K,
"b_K": b_K,
"W_V": W_V,
"b_V": b_V,
"mask": regular_attention.state_dict()["mask"],
"IGNORE": regular_attention.state_dict()["IGNORE"],
}
flash_attention_state_dict = {
"W_Q": W_Q,
"b_Q": b_Q,
"W_O": W_O,
"b_O": b_O,
"W_K": W_K,
"b_K": b_K,
"W_V": W_V,
"b_V": b_V,
"mask": flash_attention.state_dict()["mask"],
"IGNORE": flash_attention.state_dict()["IGNORE"],
}
flash_gqa_attention_state_dict = {
"W_Q": W_Q,
"b_Q": b_Q,
"W_O": W_O,
"b_O": b_O,
"_W_K": _W_K,
"_b_K": _b_K,
"_W_V": _W_V,
"_b_V": _b_V,
"mask": flash_attention.state_dict()["mask"],
"IGNORE": flash_attention.state_dict()["IGNORE"],
}

regular_attention.load_state_dict(regular_attention_state_dict)
regular_attention.to(device)
flash_attention.load_state_dict(flash_attention_state_dict)
flash_attention.to(device)
flash_gqa_attention.load_state_dict(flash_gqa_attention_state_dict)
flash_gqa_attention.to(device)

query_input = torch.rand((1, 5, d_model), dtype=dtype).to(device)
key_input = torch.rand((1, 5, d_model), dtype=dtype).to(device)
value_input = torch.rand((1, 5, d_model), dtype=dtype).to(device)

# Test regular attention and attention with FlashAttentionV2
regular_attn_output = regular_attention(query_input, key_input, value_input)
flash_attn_output = flash_attention(query_input, key_input, value_input)

assert torch.allclose(regular_attn_output, flash_attn_output, rtol=1e-2)

# Test FlashAttention behaves correctly when use_split_qkv_input is True
flash_gqa_attention.cfg.use_split_qkv_input = True
split_query_input = einops.repeat(query_input, "b n d -> b n h d", h=n_heads).clone()
split_key_input = einops.repeat(key_input, "b n d -> b n h d", h=n_key_value_heads).clone()
split_value_input = einops.repeat(value_input, "b n d -> b n h d", h=n_key_value_heads).clone()

split_flash_attn_output = flash_gqa_attention(
split_query_input, split_key_input, split_value_input
)

assert torch.allclose(regular_attn_output, split_flash_attn_output, rtol=1e-2)
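
Note on the grouped-query setup above: the following is a small illustrative sketch (not part of the commit) of how torch.repeat_interleave expands the 4 key/value heads to the 16 query heads used in the test, so that each group of 4 consecutive query heads shares one KV head.

import torch

n_heads, n_key_value_heads, d_model, d_head = 16, 4, 512, 32
_W_K = torch.rand(n_key_value_heads, d_model, d_head)
# Repeat each KV head so that every group of 4 consecutive query heads shares it.
W_K = torch.repeat_interleave(_W_K, dim=0, repeats=n_heads // n_key_value_heads)
assert W_K.shape == (n_heads, d_model, d_head)
assert torch.equal(W_K[0], W_K[3])      # heads 0-3 share the first KV head
assert not torch.equal(W_K[3], W_K[4])  # head 4 starts the next group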
2 changes: 2 additions & 0 deletions TransformerLens/transformer_lens/HookedTransformer.py
@@ -1039,6 +1039,7 @@ def from_pretrained(
cls,
model_name: str,
fold_ln: bool = True,
use_flash_attn: bool = False,
center_writing_weights: bool = True,
center_unembed: bool = True,
refactor_factored_attn_matrices: bool = False,
@@ -1240,6 +1241,7 @@ def from_pretrained(
checkpoint_index=checkpoint_index,
checkpoint_value=checkpoint_value,
fold_ln=fold_ln,
use_flash_attn=use_flash_attn,
device=device,
n_devices=n_devices,
default_prepend_bos=default_prepend_bos,
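For context, a minimal usage sketch of the new from_pretrained flag (illustrative only: the model name is arbitrary, and a CUDA GPU plus the flash-attn package are assumed to be available).

import torch
from transformer_lens import HookedTransformer

model = HookedTransformer.from_pretrained(
    "gpt2",                # any supported model name (illustrative)
    use_flash_attn=True,   # route attention through FlashAttention-2
    dtype=torch.bfloat16,  # FlashAttention only supports bf16/fp16
    device="cuda",
)
logits = model(model.to_tokens("Hello, world!"))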
3 changes: 3 additions & 0 deletions TransformerLens/transformer_lens/HookedTransformerConfig.py
@@ -73,6 +73,8 @@ class HookedTransformerConfig:
custom config, if loading from pretrained then this is not needed.
use_local_attn (bool): whether to use local attention - ie each
destination token can only attend to source tokens a certain distance back.
use_flash_attn (bool): whether to use FlashAttention-2. Please refer to
https://github.com/Dao-AILab/flash-attention.
window_size (int, *optional*): the size of the window for local
attention
attn_types (List[str], *optional*): the types of attention to use for
@@ -177,6 +179,7 @@ class HookedTransformerConfig:
use_hook_mlp_in: bool = False
use_attn_in: bool = False
use_local_attn: bool = False
use_flash_attn: bool = False
original_architecture: Optional[str] = None
from_checkpoint: bool = False
checkpoint_index: Optional[int] = None
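As a sketch of the practical effect of the new config flag (not part of the commit; requires flash-attn to be installed): with use_flash_attn=True the attention-internals hooks are not created, as implemented in abstract_attention.py below.

import torch
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
from transformer_lens.components import Attention

cfg = HookedTransformerConfig.from_dict({
    "use_flash_attn": True,    # new flag added in this commit
    "d_model": 512,
    "d_head": 32,
    "n_heads": 16,
    "n_ctx": 128,
    "n_layers": 1,
    "act_fn": "silu",
    "dtype": torch.bfloat16,   # FlashAttention only supports bf16/fp16
})
attn = Attention(cfg)
assert hasattr(attn, "flash_attn_func")
# The fused kernel never materializes scores or patterns, so these hooks are absent.
assert not hasattr(attn, "hook_pattern") and not hasattr(attn, "hook_attn_scores")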
177 changes: 143 additions & 34 deletions TransformerLens/transformer_lens/components/abstract_attention.py
@@ -96,13 +96,27 @@ def __init__(
if self.cfg.scale_attn_by_inverse_layer_idx:
assert self.layer_id is not None # keep mypy happy
self.attn_scale *= self.layer_id + 1

self.hook_k = HookPoint() # [batch, pos, head_index, d_head]
self.hook_q = HookPoint() # [batch, pos, head_index, d_head]
self.hook_v = HookPoint() # [batch, pos, head_index, d_head]
self.hook_z = HookPoint() # [batch, pos, head_index, d_head]
self.hook_attn_scores = HookPoint() # [batch, head_index, query_pos, key_pos]
self.hook_pattern = HookPoint() # [batch, head_index, query_pos, key_pos]

if self.cfg.use_flash_attn:
# If using FlashAttention, import flash-attn and bind its kernel and padding helpers as attributes.
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
self.flash_attn_func = flash_attn_func
self.flash_attn_varlen_func = flash_attn_varlen_func
self.fa_index_first_axis = index_first_axis
self.fa_pad_input = pad_input
self.fa_unpad_input = unpad_input
# FlashAttention computes attention inside a fused kernel, so the intermediate results (attention scores, pattern, z) are never materialized and cannot be hooked.
else:
self.hook_z = HookPoint() # [batch, pos, head_index, d_head]
self.hook_attn_scores = HookPoint() # [batch, head_index, query_pos, key_pos]
self.hook_pattern = HookPoint() # [batch, head_index, query_pos, key_pos]


self.hook_result = HookPoint() # [batch, pos, head_index, d_model]

# See HookedTransformerConfig for more details.
@@ -200,40 +214,72 @@ def forward(
q = q.to(torch.float32)
k = k.to(torch.float32)

attn_scores = self.calculate_attention_scores(
q, k
) # [batch, head_index, query_pos, key_pos]

if self.cfg.positional_embedding_type == "alibi":
query_ctx = attn_scores.size(-2)
# The key context length is the number of positions in the past - this includes all positions in the cache
key_ctx = attn_scores.size(-1)

# only recompute when necessary to increase efficiency.
if self.alibi is None or key_ctx > self.alibi.size(-1):
self.alibi = AbstractAttention.create_alibi_bias(
self.cfg.n_heads, key_ctx, self.cfg.device
# Use FlashAttention-2 to accelerate inference. self.hook_attn_scores, self.hook_pattern and self.hook_z are not supported in this case.
if self.cfg.use_flash_attn:
# FlashAttention only supports the bf16 and fp16 dtypes
q = q.to(torch.bfloat16)
k = k.to(torch.bfloat16)

causal = self.cfg.attention_dir == "causal"
# An attention_mask means the batch contains at least one padding token,
# so unpad the sequences and use the variable-length FlashAttention kernel.
if attention_mask is not None:
batch_size, query_length, _ = q.shape
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
q, k, v, attention_mask, q.shape[1]
)

attn_scores += self.alibi[
:, :query_ctx, :key_ctx
] # [batch, head_index, query_pos, key_pos]
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

attn_output_unpad = self.flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
causal=causal,
)

if self.cfg.attention_dir == "causal":
# If causal attention, we mask it to only attend backwards. If bidirectional, we don't mask.
attn_scores = self.apply_causal_mask(
attn_scores, kv_cache_pos_offset, attention_mask
z = self.fa_pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
z = self.flash_attn_func(q, k, v, causal=causal)
else:
attn_scores = self.calculate_attention_scores(
q, k
) # [batch, head_index, query_pos, key_pos]
if additive_attention_mask is not None:
attn_scores += additive_attention_mask

attn_scores = self.hook_attn_scores(attn_scores)
pattern = F.softmax(attn_scores, dim=-1)
pattern = torch.where(torch.isnan(pattern), torch.zeros_like(pattern), pattern)
pattern = self.hook_pattern(pattern) # [batch, head_index, query_pos, key_pos]
pattern = pattern.to(self.cfg.dtype)
pattern = pattern.to(v.device)
z = self.calculate_z_scores(v, pattern) # [batch, pos, head_index, d_head]

if self.cfg.positional_embedding_type == "alibi":
query_ctx = attn_scores.size(-2)
# The key context length is the number of positions in the past - this includes all positions in the cache
key_ctx = attn_scores.size(-1)

# only recompute when necessary to increase efficiency.
if self.alibi is None or key_ctx > self.alibi.size(-1):
self.alibi = AbstractAttention.create_alibi_bias(
self.cfg.n_heads, key_ctx, self.cfg.device
)

attn_scores += self.alibi[
:, :query_ctx, :key_ctx
] # [batch, head_index, query_pos, key_pos]

if self.cfg.attention_dir == "causal":
# If causal attention, we mask it to only attend backwards. If bidirectional, we don't mask.
attn_scores = self.apply_causal_mask(
attn_scores, kv_cache_pos_offset, attention_mask
) # [batch, head_index, query_pos, key_pos]
if additive_attention_mask is not None:
attn_scores += additive_attention_mask

attn_scores = self.hook_attn_scores(attn_scores)
pattern = F.softmax(attn_scores, dim=-1)
pattern = torch.where(torch.isnan(pattern), torch.zeros_like(pattern), pattern)
pattern = self.hook_pattern(pattern) # [batch, head_index, query_pos, key_pos]
pattern = pattern.to(self.cfg.dtype)
pattern = pattern.to(v.device)
z = self.calculate_z_scores(v, pattern) # [batch, pos, head_index, d_head]
if not self.cfg.use_attn_result:
if self.cfg.load_in_4bit:
# call bitsandbytes method to dequantize and multiply
@@ -656,3 +702,66 @@ def create_alibi_bias(
alibi_bias = torch.einsum("ij,k->kij", slope, multipliers)

return alibi_bias

def _upad_input(
self,
query_layer: Float[torch.Tensor, "batch key_pos head_index d_head"],
key_layer: Float[torch.Tensor, "batch key_pos head_index d_head"],
value_layer: Float[torch.Tensor, "batch key_pos head_index d_head"],
attention_mask: Optional[Float[torch.Tensor, "batch 1 1 pos"]],
query_length: int,
):
"""
Adapted from the FlashAttention implementation for Llama in the transformers package (LlamaFlashAttention2).
Used when an attention mask is provided: unpads the query/key/value tensors so the variable-length FlashAttention kernel can skip padding tokens.
"""
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

key_layer = self.fa_index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = self.fa_index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = self.fa_index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.cfg.n_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
) # There is a memcpy here, which is inefficient.
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
# The -q_len: slice assumes left padding.
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = self.fa_unpad_input(query_layer, attention_mask)

return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)

def _get_unpad_data(attention_mask):
"""
From transformers.models.llama.modeling_llama
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
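
To make the unpadding bookkeeping concrete, here is a small standalone sketch (toy mask chosen for illustration) that reproduces what _get_unpad_data computes for a left-padded batch:

import torch
import torch.nn.functional as F

# Toy left-padded attention mask: 1 = real token, 0 = padding.
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [0, 1, 1, 1, 1]])

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # tensor([3, 4])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()               # 4
# Cumulative sequence lengths with a leading 0, the layout flash_attn_varlen_func consumes.
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(indices)             # tensor([2, 3, 4, 6, 7, 8, 9]): flattened positions of real tokens
print(cu_seqlens)          # tensor([0, 3, 7], dtype=torch.int32)
print(max_seqlen_in_batch) # 4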
5 changes: 5 additions & 0 deletions TransformerLens/transformer_lens/loading_from_pretrained.py
@@ -1224,6 +1224,7 @@ def get_pretrained_model_config(
checkpoint_index: Optional[int] = None,
checkpoint_value: Optional[int] = None,
fold_ln: bool = False,
use_flash_attn: bool = False,
device: Optional[Union[str, torch.device]] = None,
n_devices: int = 1,
default_prepend_bos: bool = True,
@@ -1251,6 +1252,8 @@
fold_ln (bool, optional): Whether to fold the layer norm into the
subsequent linear layers (see HookedTransformer.fold_layer_norm for
details). Defaults to False.
use_flash_attn (bool): whether to use FlashAttention-2. Please refer to
https://github.com/Dao-AILab/flash-attention. Defaults to False.
device (str, optional): The device to load the model onto. By
default will load to CUDA if available, else CPU.
n_devices (int, optional): The number of devices to split the model across. Defaults to 1.
@@ -1310,6 +1313,8 @@
cfg_dict["normalization_type"] = "RMSPre"
else:
logging.warning("Cannot fold in layer norm, normalization_type is not LN.")
if use_flash_attn:
cfg_dict["use_flash_attn"] = True

if checkpoint_index is not None or checkpoint_value is not None:
checkpoint_labels, checkpoint_label_type = get_checkpoint_labels(
1 change: 1 addition & 0 deletions examples/configuration/analyze.toml
@@ -19,6 +19,7 @@ exp_result_dir = "results"
[lm]
model_name = "gpt2"
d_model = 768
use_flash_attn = false

[dataset]
dataset_path = "openwebtext"