In [1]:
import torch
import torch.nn as nn


In [2]:
sentence = torch.tensor([0, 7, 1, 2, 5, 6, 4, 3])
sentence

tensor([0, 7, 1, 2, 5, 6, 4, 3])

In [3]:
torch.manual_seed(123)
embed = nn.Embedding(num_embeddings=10, embedding_dim=16)
embedded_sentence = embed(sentence).detach()
embedded_sentence.shape

torch.Size([8, 16])

In [4]:
omega = torch.empty(8, 8)
for i, x_i in enumerate(embedded_sentence):
     for j, x_j in enumerate(embedded_sentence):
         omega[i, j] = torch.dot(x_i, x_j)
omega

tensor([[ 9.7601,  1.7326,  4.7543, -1.3587,  0.4752, -1.6717,  1.0227, -0.1286],
        [ 1.7326, 16.0787,  9.0642, -0.3370,  1.1368,  1.1972,  1.6485, -1.2789],
        [ 4.7543,  9.0642, 22.6615, -0.8519,  7.7799,  2.7483, -0.6832,  1.6236],
        [-1.3587, -0.3370, -0.8519, 13.9473, -1.4198, 10.9659, -0.5887,  2.3869],
        [ 0.4752,  1.1368,  7.7799, -1.4198, 13.7511, -6.8568, -2.5114, -3.3468],
        [-1.6717,  1.1972,  2.7483, 10.9659, -6.8568, 24.6738, -3.8294,  4.9581],
        [ 1.0227,  1.6485, -0.6832, -0.5887, -2.5114, -3.8294, 15.8691,  2.0269],
        [-0.1286, -1.2789,  1.6236,  2.3869, -3.3468,  4.9581,  2.0269, 18.7382]])

In [5]:
omega_mat = embedded_sentence.matmul(embedded_sentence.T)
omega_mat

tensor([[ 9.7601,  1.7326,  4.7543, -1.3587,  0.4752, -1.6717,  1.0227, -0.1286],
        [ 1.7326, 16.0787,  9.0642, -0.3370,  1.1368,  1.1972,  1.6485, -1.2789],
        [ 4.7543,  9.0642, 22.6615, -0.8519,  7.7799,  2.7483, -0.6832,  1.6236],
        [-1.3587, -0.3370, -0.8519, 13.9473, -1.4198, 10.9659, -0.5887,  2.3869],
        [ 0.4752,  1.1368,  7.7799, -1.4198, 13.7511, -6.8568, -2.5114, -3.3468],
        [-1.6717,  1.1972,  2.7483, 10.9659, -6.8568, 24.6738, -3.8294,  4.9581],
        [ 1.0227,  1.6485, -0.6832, -0.5887, -2.5114, -3.8294, 15.8691,  2.0269],
        [-0.1286, -1.2789,  1.6236,  2.3869, -3.3468,  4.9581,  2.0269, 18.7382]])

In [6]:
torch.allclose(omega, omega_mat)

True

In [7]:
import torch.nn.functional as F
attention_weights = F.softmax(omega, dim=1)
attention_weights.shape

torch.Size([8, 8])

In [8]:
attention_weights.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [9]:
x_2 = embedded_sentence[1, :]
context_vec_2 = torch.zeros(x_2.shape)
for j in range(8):
    x_j = embedded_sentence[j, :]
    print('x_j:', x_j)
    context_vec_2 += attention_weights[1, j] * x_j
    print('context_vec_2:', context_vec_2)

x_j: tensor([ 0.3374, -0.1778, -0.3035, -0.5880,  0.3486,  0.6603, -0.2196, -0.3792,
         0.7671, -1.1925,  0.6984, -1.4097,  0.1794,  1.8951,  0.4954,  0.2692])
context_vec_2: tensor([ 1.9828e-07, -1.0448e-07, -1.7839e-07, -3.4559e-07,  2.0488e-07,
         3.8810e-07, -1.2909e-07, -2.2285e-07,  4.5085e-07, -7.0086e-07,
         4.1044e-07, -8.2853e-07,  1.0543e-07,  1.1138e-06,  2.9119e-07,
         1.5822e-07])
x_j: tensor([-9.4053e-01, -4.6806e-01,  1.0322e+00, -2.8300e-01,  4.9275e-01,
        -1.4078e-02, -2.7466e-01, -7.6409e-01,  1.3966e+00, -9.9491e-01,
        -1.5822e-03,  1.2471e+00, -7.7105e-02,  1.2774e+00, -1.4596e+00,
        -2.1595e+00])
context_vec_2: tensor([-9.3968e-01, -4.6764e-01,  1.0313e+00, -2.8275e-01,  4.9231e-01,
        -1.4065e-02, -2.7442e-01, -7.6340e-01,  1.3953e+00, -9.9402e-01,
        -1.5804e-03,  1.2460e+00, -7.7035e-02,  1.2762e+00, -1.4583e+00,
        -2.1576e+00])
x_j: tensor([-0.0770, -1.0205, -0.1690,  0.9178,  1.5810,  1.3010,  1.2753, 

In [10]:
context_vectors = torch.matmul(attention_weights, embedded_sentence)
context_vectors

tensor([[ 3.3420e-01, -1.8324e-01, -3.0218e-01, -5.7772e-01,  3.5662e-01,
          6.6452e-01, -2.0998e-01, -3.7798e-01,  7.6537e-01, -1.1946e+00,
          6.9960e-01, -1.4067e+00,  1.7021e-01,  1.8838e+00,  4.8729e-01,
          2.4730e-01],
        [-9.3975e-01, -4.6856e-01,  1.0311e+00, -2.8192e-01,  4.9373e-01,
         -1.2896e-02, -2.7327e-01, -7.6358e-01,  1.3958e+00, -9.9543e-01,
         -7.1287e-04,  1.2449e+00, -7.8077e-02,  1.2765e+00, -1.4589e+00,
         -2.1601e+00],
        [-7.7021e-02, -1.0205e+00, -1.6895e-01,  9.1776e-01,  1.5810e+00,
          1.3010e+00,  1.2753e+00, -2.0095e-01,  4.9647e-01, -1.5723e+00,
          9.6657e-01, -1.1481e+00, -1.1589e+00,  3.2547e-01, -6.3151e-01,
         -2.8400e+00],
        [-1.3679e+00,  1.0614e-01, -2.1317e+00,  1.0480e+00, -3.7127e-01,
         -9.1234e-01, -4.3802e-01, -1.0329e+00,  9.3425e-01,  1.5453e+00,
          5.7218e-01, -1.8049e-01, -6.0453e-03, -8.8691e-02,  2.0559e-01,
         -5.2292e-01],
        [ 2.5444e-01

In [11]:
torch.allclose(context_vectors[1], context_vec_2)

True

In [12]:
torch.manual_seed(123)
d = embedded_sentence.shape[1]
U_query = torch.rand(d, d)
U_key = torch.rand(d, d)
U_value = torch.rand(d, d)

d

16

In [13]:
x_2 = embedded_sentence[1]
query_2 = U_query.matmul(x_2)
query_2, query_2.shape

(tensor([-1.2403, -2.9754, -0.2894, -0.4004, -2.9578, -0.2939, -0.2266, -3.6482,
         -2.6450, -0.9536, -1.1116,  1.1717, -2.2671, -0.7874, -2.0140, -1.6652]),
 torch.Size([16]))

In [14]:
key_2 = U_key.matmul(x_2)
value_2 = U_value.matmul(x_2)
key_2, value_2

(tensor([-1.2952,  0.5116, -0.5343, -2.1730, -0.5293, -0.4932, -2.0952, -0.5830,
         -0.2856,  0.1277,  0.6852, -1.5782, -0.9960, -2.3458, -0.4437, -0.5510]),
 tensor([ 0.6654, -1.1762,  0.2593, -1.2616, -1.1232, -1.1314, -0.8960, -0.0376,
         -3.1714, -0.4293, -1.6761, -0.0262, -0.6826, -0.7709,  0.5206, -2.5693]))

In [15]:
queries = embedded_sentence.matmul(U_query.T)
queries

tensor([[-0.7665, -1.1306,  0.0167,  1.3456, -0.0111,  0.2577,  0.3018,  0.5771,
          0.8287, -0.3224,  0.9979, -1.3807,  0.7953,  0.3018,  1.1813,  1.3559],
        [-1.2403, -2.9754, -0.2894, -0.4004, -2.9578, -0.2939, -0.2266, -3.6482,
         -2.6450, -0.9536, -1.1116,  1.1717, -2.2671, -0.7874, -2.0140, -1.6652],
        [-1.4620, -2.9039, -2.9815,  0.1616, -1.7143, -3.4239, -1.4447, -2.2414,
         -3.0720, -3.0587,  1.1979,  0.2866, -0.5552, -0.0643, -1.3262, -0.3223],
        [ 0.0758, -1.2134, -2.7353, -1.2504, -2.3630, -1.7872, -2.4223, -1.3309,
          0.0371,  0.4128, -0.2362, -1.7722, -0.9576, -0.6908, -1.9191, -0.0077],
        [-1.0274, -3.9126, -2.1115, -1.1729, -2.0862, -4.8391, -1.5899, -2.5706,
         -3.0113, -3.2927, -4.0568, -0.3453, -3.0388, -2.1831, -2.6464, -2.5228],
        [-0.6838, -2.4960, -4.3936, -3.7471, -2.7305, -2.1619, -5.9295, -3.5328,
         -1.5616,  0.2982, -0.4995, -2.9656, -1.4150, -1.2241, -2.2443, -2.1584],
        [ 0.8982,  0.1

In [16]:
keys1 = U_key.matmul(embedded_sentence.T).T
values1 = U_value.matmul(embedded_sentence.T).T
# print(keys, values)

keys = embedded_sentence.matmul(U_key.T)  # embedded_sentence (L, d), U_value (16x16), для перемножения нужно чтобы было (16хА) и (Ах8)
values = embedded_sentence.matmul(U_value.T)
torch.allclose(keys1, keys), torch.allclose(values1, values)

(True, True)

In [17]:
import numpy as np

In [18]:
scores = queries @ keys.T
scores

tensor([[ -0.7569,  -3.7951,  -7.9465, -10.0615, -12.1732, -12.8006,   4.1644,
           6.3346],
        [-25.1623,   9.3602,  14.3667,  32.1482,  53.8976,  46.6626,  -1.2131,
         -32.9392],
        [-28.8096,  10.9046,  14.4355,  23.8255,  52.7999,  41.3237,   1.5884,
         -35.1890],
        [-15.5115,  17.5500,  19.8771,  21.5002,  42.0597,  35.2061,  -0.5541,
         -25.9203],
        [-36.3682,  20.2438,  27.1240,  49.8610,  84.9364,  85.7472,   5.8265,
         -69.9103],
        [-34.6901,  38.3814,  42.0269,  48.1298,  92.0512,  74.9869,  -6.6510,
         -65.5576],
        [ -1.1880,   3.7619,  -5.6129,  -6.8690,   6.3126, -13.3452,  -1.3225,
          -6.2390],
        [ 31.8297, -25.2041, -25.3536, -57.8440, -79.4676, -85.3054, -10.5390,
          64.5980]])

In [19]:
scaled_scores = scores / np.sqrt(U_key.shape[1]) # масштабирование
attention_weights = torch.softmax(scaled_scores, dim=-1)  # нормализация с помощью SoftMax
attention_weights

tensor([[8.9701e-02, 4.1968e-02, 1.4866e-02, 8.7611e-03, 5.1675e-03, 4.4174e-03,
         3.0698e-01, 5.2813e-01],
        [2.2317e-09, 1.2499e-05, 4.3696e-05, 3.7242e-03, 8.5596e-01, 1.4026e-01,
         8.8897e-07, 3.1935e-10],
        [1.3033e-09, 2.6728e-05, 6.4614e-05, 6.7582e-04, 9.4557e-01, 5.3664e-02,
         2.6030e-06, 2.6450e-10],
        [4.7089e-07, 1.8304e-03, 3.2749e-03, 4.9139e-03, 8.3877e-01, 1.5119e-01,
         1.9811e-05, 3.4899e-08],
        [3.0354e-14, 4.2539e-08, 2.3757e-07, 6.9893e-05, 4.4946e-01, 5.5047e-01,
         1.1573e-09, 6.9252e-18],
        [1.7107e-14, 1.4683e-06, 3.6528e-06, 1.6797e-05, 9.8614e-01, 1.3842e-02,
         1.8944e-11, 7.6169e-18],
        [7.7890e-02, 2.6847e-01, 2.5766e-02, 1.8822e-02, 5.0797e-01, 3.7285e-03,
         7.5313e-02, 2.2033e-02],
        [2.7676e-04, 1.7772e-10, 1.7120e-10, 5.0805e-14, 2.2812e-16, 5.3006e-17,
         6.9499e-09, 9.9972e-01]])

In [20]:
attention_weights.shape, values.shape

(torch.Size([8, 8]), torch.Size([8, 16]))

In [21]:
final_context_vector = attention_weights @ values
final_context_vector

tensor([[ 1.1060,  0.9678,  1.5669,  1.5762,  1.7334,  0.3509,  2.1180,  1.6764,
          1.6418,  0.7886,  1.9459,  1.0276,  1.2672,  1.0364, -0.4764,  0.7733],
        [-1.2226, -3.4387, -4.3928, -5.2125, -1.1249, -3.3041, -1.4316, -3.2765,
         -2.5114, -2.6105, -1.5793, -2.8433, -2.4142, -0.3998, -1.9917, -3.3499],
        [-1.1639, -3.5045, -4.5964, -5.8008, -0.9892, -3.2593, -1.3924, -3.4383,
         -2.6604, -2.4408, -1.3912, -2.9849, -2.3546,  0.1087, -2.3330, -3.6736],
        [-1.2207, -3.4202, -4.3491, -5.1151, -1.1445, -3.2986, -1.4325, -3.2335,
         -2.4892, -2.6230, -1.6025, -2.8163, -2.4199, -0.4728, -1.9363, -3.2950],
        [-1.4894, -3.1681, -3.4794, -2.5261, -1.7571, -3.5199, -1.6488, -2.5390,
         -1.8424, -3.4175, -2.4575, -2.2117, -2.7089, -2.7630, -0.4126, -1.8831],
        [-1.1375, -3.5332, -4.6883, -6.0675, -0.9272, -3.2388, -1.3731, -3.5119,
         -2.7275, -2.3628, -1.3052, -3.0486, -2.3267,  0.3408, -2.4886, -3.8200],
        [-0.1553, -2.1

In [22]:
values.shape

torch.Size([8, 16])

In [23]:
attention_weights.shape, U_value.shape

(torch.Size([8, 8]), torch.Size([16, 16]))

In [24]:
torch.manual_seed(123)
d = embedded_sentence.shape[1]
one_U_query = torch.rand(d, d)

h = 8
multihead_U_query = torch.rand(h, d, d)
multihead_U_key = torch.rand(h, d, d)
multihead_U_value = torch.rand(h, d, d)

In [25]:
multihead_U_key.shape

torch.Size([8, 16, 16])

In [26]:
x_2

tensor([-9.4053e-01, -4.6806e-01,  1.0322e+00, -2.8300e-01,  4.9275e-01,
        -1.4078e-02, -2.7466e-01, -7.6409e-01,  1.3966e+00, -9.9491e-01,
        -1.5822e-03,  1.2471e+00, -7.7105e-02,  1.2774e+00, -1.4596e+00,
        -2.1595e+00])

In [27]:
mh_q2 = multihead_U_query.matmul(x_2)
x_2.shape, multihead_U_query.shape

(torch.Size([16]), torch.Size([8, 16, 16]))

In [28]:
mh_q2

tensor([[-1.2952,  0.5116, -0.5343, -2.1730, -0.5293, -0.4932, -2.0952, -0.5830,
         -0.2856,  0.1277,  0.6852, -1.5782, -0.9960, -2.3458, -0.4437, -0.5510],
        [ 0.6654, -1.1762,  0.2593, -1.2616, -1.1232, -1.1314, -0.8960, -0.0376,
         -3.1714, -0.4293, -1.6761, -0.0262, -0.6826, -0.7709,  0.5206, -2.5693],
        [-1.0130,  2.4321, -0.0691, -1.1237, -1.3568,  0.1933, -1.6615, -2.0000,
          0.4469, -1.0241, -1.0079, -1.6620, -2.3453, -0.1093, -2.5826, -1.3624],
        [ 1.0933, -2.7744, -1.0884, -3.5698,  0.2782, -1.6767, -1.5483,  0.3518,
         -1.6836, -0.1056,  0.7932,  0.5691, -1.4224, -0.5437, -1.3615, -1.9626],
        [ 0.0784,  0.1832, -1.6212, -0.6045, -2.6574, -0.9026, -1.7585, -1.9547,
         -1.8166, -3.3531, -1.9808,  0.5954, -1.2462, -1.1287, -4.5376, -1.2510],
        [-0.9583, -0.4244, -1.3361, -0.7831,  0.0427, -0.4317, -0.3390, -0.8504,
         -0.9676, -1.1346, -1.1462, -1.2955,  0.3383,  0.7116,  0.5113, -1.7576],
        [-2.4430, -1.1

In [29]:
mh_k2 = multihead_U_key.matmul(x_2)
mh_v2 = multihead_U_value.matmul(x_2)

In [30]:
embedded_sentence.shape

torch.Size([8, 16])

In [31]:
embedded_sentence.T.repeat(8, 1, 1).shape

torch.Size([8, 16, 8])

In [32]:
torch.allclose(embedded_sentence, embedded_sentence.T.repeat(8, 1, 1)[0].T)

True

In [40]:
embedded_sentence.T.repeat(8, 1, 1).shape

torch.Size([8, 16, 8])

In [44]:
4 * 16

64

In [35]:
embedded_sentence.T.repeat(8, 1, 1)[0].T

tensor([[ 3.3737e-01, -1.7778e-01, -3.0353e-01, -5.8801e-01,  3.4861e-01,
          6.6034e-01, -2.1964e-01, -3.7917e-01,  7.6711e-01, -1.1925e+00,
          6.9835e-01, -1.4097e+00,  1.7938e-01,  1.8951e+00,  4.9545e-01,
          2.6920e-01],
        [-9.4053e-01, -4.6806e-01,  1.0322e+00, -2.8300e-01,  4.9275e-01,
         -1.4078e-02, -2.7466e-01, -7.6409e-01,  1.3966e+00, -9.9491e-01,
         -1.5822e-03,  1.2471e+00, -7.7105e-02,  1.2774e+00, -1.4596e+00,
         -2.1595e+00],
        [-7.7020e-02, -1.0205e+00, -1.6896e-01,  9.1776e-01,  1.5810e+00,
          1.3010e+00,  1.2753e+00, -2.0095e-01,  4.9647e-01, -1.5723e+00,
          9.6657e-01, -1.1481e+00, -1.1589e+00,  3.2547e-01, -6.3151e-01,
         -2.8400e+00],
        [-1.3250e+00,  1.7843e-01, -2.1338e+00,  1.0524e+00, -3.8848e-01,
         -9.3435e-01, -4.9914e-01, -1.0867e+00,  8.8054e-01,  1.5542e+00,
          6.2662e-01, -1.7549e-01,  9.8284e-02, -9.3507e-02,  2.6621e-01,
         -5.8504e-01],
        [ 2.5529e-01

In [33]:
embedded_sentence.T.repeat(8, 1, 1)[0].T.shape

torch.Size([8, 16])

In [45]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')

Device set to use cuda:0


In [46]:
set_seed(123)
generator('Hello folks, today is', max_length=20, num_return_sequences=3)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': "Hello folks, today is the first time that I have ever written about a game (and not a game about a band like you) that has not been released to the public, so I thought I'd try to share a couple of the early examples of what was happening here.\n\nIn the early days, we were all working on the title of a game. Then, after a short time, we decided that we needed to add more content. We had a few ideas, but after a while, we decided that we needed to make an action game. Since the studio wanted to make a game about a band, we had to make an action game.\n\nThe story of this game is very similar to the story of any other band. In order to get a band to play, we had to pay a band's license fee. We then had to sign a contract with the studio to make the game. This meant that we had to pay the band's cost, and as we got older, the band had to pay the studio's cost. This was a very large burden, and we had to pay it.\n\nIn our project we had to make a game that we could se

In [48]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
text = 'Lets us encode this sentence'
encoded_input = tokenizer(text, return_tensors='pt')

encoded_input

{'input_ids': tensor([[   43,  1039,   514, 37773,   428,  6827]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}