In [1]:
import numpy as np
import mindspore
from mindspore import nn, ops, Tensor

  setattr(self, word, getattr(machar, word).flat[0])
  setattr(self, word, getattr(machar, word).flat[0])


# GPT-2 Masked Self-Attention

## GPT-2 Self-attention: 1- Creating queries, keys, and values

![gpt2-self-attention-3.png](https://jalammar.github.io/images/gpt2/gpt2-self-attention-3.png)

In [2]:
batch_size = 1
seq_len = 10
embed_dim = 768

# Fix the random state so that "Restart Kernel -> Run All" produces the same
# input tensor every time instead of fresh random values on each run.
np.random.seed(0)
mindspore.set_seed(0)

# Random stand-in for a batch of embeddings, shaped (batch_size, seq_len, embed_dim).
x = Tensor(np.random.randn(batch_size, seq_len, embed_dim), mindspore.float32)

In [5]:
from mindnlp._legacy.functional import split
from mindnlp.transformers.ms_utils import Conv1D

# One Conv1D layer produces 3 * embed_dim features per position; the result is
# then cut along axis 2 into three embed_dim-wide pieces: query, key and value.
c_attn = Conv1D(3 * embed_dim, embed_dim)
query, key, value = split(c_attn(x), embed_dim, axis=2)
tuple(projection.shape for projection in (query, key, value))

  self._event_pipes[threading.current_thread()] = event_pipe


\

((1, 10, 768), (1, 10, 768), (1, 10, 768))

![gpt2-self-attention-split-attention-heads-1.png](https://jalammar.github.io/images/gpt2/gpt2-self-attention-split-attention-heads-1.png)

![gpt2-self-attention-split-attention-heads-2.png](https://jalammar.github.io/images/gpt2/gpt2-self-attention-split-attention-heads-2.png)

In [6]:
def split_heads(tensor, num_heads, attn_head_size):
    """
    Break the last axis of ``tensor`` into per-head feature vectors.

    The trailing axis is viewed as ``(num_heads, attn_head_size)`` and axes 1
    and 2 of the result are then swapped so the head axis comes before the
    sequence axis.

    Args:
        tensor: input whose last axis is split; the fixed (0, 2, 1, 3)
            permutation below means the reshaped result must be 4-D.
        num_heads: number of attention heads.
        attn_head_size: number of features per head.

    Returns:
        Tensor laid out as (batch, head, seq_length, head_features).
    """
    per_head_shape = (*tensor.shape[:-1], num_heads, attn_head_size)
    per_head = tensor.view(per_head_shape)
    # (batch, seq_length, head, head_features) -> (batch, head, seq_length, head_features)
    return ops.transpose(per_head, (0, 2, 1, 3))

In [7]:
num_heads = 12
head_dim = embed_dim // num_heads  # features handled by each head

# Apply the same head split to each of the three projections.
query, key, value = (
    split_heads(projection, num_heads, head_dim)
    for projection in (query, key, value)
)

tuple(projection.shape for projection in (query, key, value))

((1, 12, 10, 64), (1, 12, 10, 64), (1, 12, 10, 64))

|

## GPT-2 Self-attention: 2- Scoring

![gpt2-self-attention-scoring.png](https://jalammar.github.io/images/gpt2/gpt2-self-attention-scoring.png)

![](https://jalammar.github.io/images/gpt2/gpt2-self-attention-scoring-2.png)

In [8]:
# Raw (unscaled, unmasked) attention scores: every query is multiplied with
# every key of the same head. Swapping the last two axes of `key` turns
# (batch, head, seq_length, head_features) into (batch, head, head_features,
# seq_length), so the product is (batch, head, seq_length, seq_length).
attn_weights = ops.matmul(query, key.swapaxes(-1, -2))

attn_weights.shape

(1, 12, 10, 10)

/

![](https://jalammar.github.io/images/gpt2/transformer-decoder-attention-mask-dataset.png)

In [9]:
max_positions = seq_len

# Lower-triangular boolean mask: entry [i, j] is True only when j <= i.
# Two leading singleton axes are added so the mask has shape
# (1, 1, max_positions, max_positions).
bias = Tensor(
    np.tril(np.ones((max_positions, max_positions)))[np.newaxis, np.newaxis, :, :],
    mindspore.bool_,
)
bias

Tensor(shape=[1, 1, 10, 10], dtype=Bool, value=
[[[[ True, False, False ... False, False, False],
   [ True,  True, False ... False, False, False],
   [ True,  True,  True ... False, False, False],
   ...
   [ True,  True,  True ...  True, False, False],
   [ True,  True,  True ...  True,  True, False],
   [ True,  True,  True ...  True,  True,  True]]]])

![](https://jalammar.github.io/images/gpt2/queries-keys-attention-mask.png)

![](https://jalammar.github.io/images/gpt2/transformer-attention-mask.png)

In [11]:
from mindnlp._legacy.functional import where, softmax

# Scale the scores by the square root of the per-head feature size.
attn_weights = attn_weights / ops.sqrt(ops.scalar_to_tensor(value.shape[-1]))

# Cut the mask down to the rows/columns that match the current query and key
# lengths (here both equal seq_len, so the whole mask is kept).
query_length = query.shape[-2]
key_length = key.shape[-2]
causal_mask = bias[:, :, key_length - query_length:key_length, :key_length].bool()

# Positions where the mask is False are overwritten with the most negative
# finite float32 value; positions where it is True keep their score.
mask_value = Tensor(np.finfo(np.float32).min, dtype=attn_weights.dtype)
attn_weights = where(causal_mask, attn_weights, mask_value)

/

In [12]:
# Most negative finite float32 value; this is the fill value written into the
# masked positions of attn_weights.
np.finfo(np.float32).min

-3.4028235e+38

In [13]:
# Scaled and masked scores for the first batch element, first head.
attn_weights[0, 0]

/

Tensor(shape=[10, 10], dtype=Float32, value=
[[-4.46166992e-02, -3.40282347e+38, -3.40282347e+38 ... -3.40282347e+38, -3.40282347e+38, -3.40282347e+38],
 [-3.74755859e-02, -4.25109863e-02, -3.40282347e+38 ... -3.40282347e+38, -3.40282347e+38, -3.40282347e+38],
 [-3.51867676e-02,  7.01293945e-02,  2.89764404e-02 ... -3.40282347e+38, -3.40282347e+38, -3.40282347e+38],
 ...
 [-1.82189941e-02,  4.60510254e-02,  3.79333496e-02 ... -1.85203552e-03, -3.40282347e+38, -3.40282347e+38],
 [ 1.98974609e-02, -6.01196289e-02,  3.48510742e-02 ...  2.45208740e-02, -8.94165039e-03, -3.40282347e+38],
 [ 6.25610352e-02, -3.41796875e-02,  3.34167480e-02 ...  4.76989746e-02, -7.72476196e-03,  1.26876831e-02]])

![](https://jalammar.github.io/images/gpt2/transformer-attention-masked-scores-softmax.png)

In [14]:
# Softmax over the last (key) axis; the shape is unchanged. Masked positions
# hold the most negative float32 value and come out as 0 in the result.
attn_weights = softmax(attn_weights, axis=-1)
attn_weights.shape

(1, 12, 10, 10)

In [15]:
# Attention weights after softmax for the first batch element, first head.
attn_weights[0, 0]

|

Tensor(shape=[10, 10], dtype=Float32, value=
[[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
 [ 5.01258850e-01,  4.98741180e-01,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
 [ 3.14729840e-01,  3.49684328e-01,  3.35585862e-01 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
 ...
 [ 1.20966740e-01,  1.28996551e-01,  1.27953634e-01 ...  1.22962892e-01,  0.00000000e+00,  0.00000000e+00],
 [ 1.13199592e-01,  1.04494609e-01,  1.14905059e-01 ...  1.13724172e-01,  1.09981641e-01,  0.00000000e+00],
 [ 1.04792789e-01,  9.51299891e-02,  1.01782754e-01 ...  1.03246868e-01,  9.76802260e-02,  9.96946022e-02]])

## GPT-2 Self-attention: 3- Sum

![gpt2-self-attention-multihead-sum-1.png](https://jalammar.github.io/images/gpt2/gpt2-self-attention-multihead-sum-1.png)

In [16]:
# Weighted sum of the value vectors, per head:
# (batch, head, seq_length, seq_length) @ (batch, head, seq_length, head_features)
# -> (batch, head, seq_length, head_features).
attn_output = ops.matmul(attn_weights, value)

attn_output.shape

(1, 12, 10, 64)

## GPT-2 Self-attention: 3.5- Merge attention heads

![](https://jalammar.github.io/images/gpt2/gpt2-self-attention-merge-heads-1.png)

In [17]:
def merge_heads(tensor, num_heads, attn_head_size):
    """
    Fold the head axis and the per-head feature axis back into one axis.

    Axes 1 and 2 of ``tensor`` are swapped first, then the last two axes are
    viewed as a single axis of size ``num_heads * attn_head_size``.

    Args:
        tensor: 4-D input (the fixed (0, 2, 1, 3) permutation requires it).
        num_heads: number of attention heads.
        attn_head_size: number of features per head.

    Returns:
        Tensor whose last axis has size ``num_heads * attn_head_size``.
    """
    heads_last = ops.transpose(tensor, (0, 2, 1, 3))
    merged_shape = (*heads_last.shape[:-2], num_heads * attn_head_size)
    return heads_last.view(merged_shape)

In [18]:
# Collapse the head and per-head feature axes back into one axis of size
# num_heads * head_dim.
attn_output = merge_heads(attn_output, num_heads, head_dim)

attn_output.shape

(1, 10, 768)

## GPT-2 Self-attention: 4- Projecting

![](https://jalammar.github.io/images/gpt2/gpt2-self-attention-project-1.png)

In [19]:
# Output projection layer, built with both size arguments equal to embed_dim.
c_proj = Conv1D(embed_dim, embed_dim)

In [20]:
# Apply the output projection to the merged heads.
attn_output = c_proj(attn_output)
attn_output.shape

\

(1, 10, 768)