In [1]:
import sys

#### Loading a Dataset

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
##Opening the dataset

In [3]:
## Read the whole corpus into memory as a single string.
## The encoding is passed explicitly so that the decoded text does not depend on
## the platform's default locale encoding.
with open("input.txt", "r", encoding="utf-8") as input_file:
    text = input_file.read()

In [4]:
len(text)

1115394

In [5]:
##Printing the first 1000 characters of the text

In [6]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [7]:
##Creating the vocabulary
##In this case, we are going to use characters as the tokens that the model sees and generates.
##In GPT and other decoder-only LLMs, the vocabulary is instead at the token/subword level.

In [8]:
chars = sorted(set(text))  ##Every distinct character in our dataset, in sorted order
vocab_size = len(chars)
print("Vocabulary", "".join(chars))
print("Length of our vocabulary", vocab_size)  ##These are all the symbols from our dataset which the model will see/generate or emit.

Vocabulary 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length of our vocabulary 65


In [9]:
##There are many libraries, based on different schemes, that we can use to split words into subword tokens; those subwords then make up the vocabulary.
##Good examples are: tiktoken, sentencepiece

In [10]:
##Tokenization:
##Tokenization can be seen as a strategy to convert a string into a sequence of integers according to the elements in the vocabulary.

In [11]:
##In our case, we will be translating individual characters into integers.

In [12]:
stoi = dict(zip(chars, range(len(chars))))  ## Lookup table: character -> integer id (its position in chars)
print(stoi)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [13]:
itos = dict(enumerate(chars))  ## Lookup table: integer id -> character (the inverse of stoi)
print(itos)

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}


In [14]:
##Creating our encoder and decoder

In [15]:
def encode(string):
    """Convert a string into a list of integer token ids, one per character.

    Each character is looked up in the module-level ``stoi`` table; a character
    that is not a key of ``stoi`` raises ``KeyError``.
    """
    return [stoi[c] for c in string]

In [16]:
def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Each id is looked up in the module-level ``itos`` table and the resulting
    characters are concatenated; an id that is not a key of ``itos`` raises
    ``KeyError``.
    """
    return "".join(itos[c] for c in l)

In [17]:
string = "hi there"
## Encode by hand: look up the integer id of every character in stoi
string_int = [stoi[ch] for ch in string]
print(string_int)

[46, 47, 1, 58, 46, 43, 56, 43]


In [18]:
## Decode by hand: map every integer id back to its character and concatenate
decoded_str = "".join(itos[token_id] for token_id in string_int)

print(decoded_str)

hi there


In [19]:
string = "hi there"
encoded_string = encode(string)
print(encoded_string)  ##Representation of the characters in strings into numbers
print(decode(encoded_string))  ##Converting integers back to characters

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [20]:
##Let's convert the dataset into the tokens

In [21]:
import torch
data = torch.tensor(encode(text), dtype = torch.long)  ##Encoding the whole text and converting it into a PyTorch tensor of integer ids

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [22]:
print(data[:1000])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [23]:
##Splitting up the Data

In [24]:
n = int(0.9 * len(data))
print(n)

1003854


In [25]:
## Everything before index n is used for training, the remainder for validation
train_data, val_data = data[:n], data[n:]

In [26]:
##Now our data is split.

In [27]:
##Now we have to feed this data into our model, so that it can learn the patterns.
##One thing to note is that we do not send the entire data to our model at once, because that would be
##very computationally expensive.
##We send chunks of the train data to our model to train. We sample random little chunks out of the
##training set and train on those chunks, a few at a time.

In [28]:
##These chunks have something called maximum length.

In [29]:
max_length = 8

In [30]:
a_random_chunk = train_data[:max_length + 1]
print(a_random_chunk)

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [31]:
##This is a chunk from our train data.
##One thing to note is that when we sample a chunk of data like this, i.e. these 9 characters out of the data,
##it has multiple examples packed into it, because all of these characters follow each other.
##In our case these are characters; in the general case, words or tokens.
##Since each character is followed by another character, there are multiple examples packed into this chunk.

In [32]:
##So when this thing is passed into the transformer, what we are going to do is,
##we are going to simultaneously train it to make predictions on every one of these positions.
##In our case, in this chunk of 9 characters, there are 8 examples packed in there.

In [33]:
x = train_data[:max_length] ##x is the first max_length characters: the inputs to the transformer
y = train_data[1:max_length + 1] ##y is the same window shifted by 1: the target for each position in the input
print(x)
print(y)

tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])


In [34]:
for t in range(max_length):
    input_ = x[:t + 1]  ##input is all the characters of the chunk up to and including position t
    target = y[t]  ##target is always the t'th character of y, i.e. the character that follows the input
    print(f"When input is: {input_}, target is: {target}")

When input is: tensor([18]), target is: 47
When input is: tensor([18, 47]), target is: 56
When input is: tensor([18, 47, 56]), target is: 57
When input is: tensor([18, 47, 56, 57]), target is: 58
When input is: tensor([18, 47, 56, 57, 58]), target is: 1
When input is: tensor([18, 47, 56, 57, 58,  1]), target is: 15
When input is: tensor([18, 47, 56, 57, 58,  1, 15]), target is: 47
When input is: tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is: 58


In [35]:
##This is how 8 examples are packed into that single chunk of train_data.

In [36]:
##When we train the transformer, we are going to have batches of multiple chunks,
##all passed to the model to be processed in parallel. All chunks are completely independent of each other.

In [37]:
batch_size = 4

In [38]:
##Generating batch_size random start indices into train_data.
##The upper bound len(train_data) - max_length is exclusive, so every start index leaves room
##for max_length input characters plus one extra target character.
ix = torch.randint(len(train_data) - max_length, (batch_size,))
print(ix)

tensor([383251, 758695, 470941, 450793])


In [39]:
for i in ix:
    print(train_data[i: i+max_length])
    print(train_data[i + 1: i + max_length + 1])

tensor([42, 57, 51, 43, 52,  1, 50, 43])
tensor([57, 51, 43, 52,  1, 50, 43, 39])
tensor([41, 53, 51, 43,  0, 13,  1, 50])
tensor([53, 51, 43,  0, 13,  1, 50, 39])
tensor([43, 56, 43,  1, 47, 52,  1, 51])
tensor([56, 43,  1, 47, 52,  1, 51, 63])
tensor([58,  1, 61, 46, 39, 58, 12,  0])
tensor([ 1, 61, 46, 39, 58, 12,  0,  0])


In [40]:
def get_batch(split):
    """Sample a random batch of input/target chunks from one data split.

    ``split == "train"`` draws from ``train_data``; any other value draws from
    ``val_data``. Returns ``(x, y)``: two tensors stacked from ``batch_size``
    chunks of ``max_length`` tokens each, where every row of ``y`` is the
    matching row of ``x`` shifted one position forward in the data.
    """
    source = val_data if split != "train" else train_data
    ## The exclusive upper bound keeps start + max_length + 1 within the data
    starts = torch.randint(len(source) - max_length, (batch_size, ))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start: start + max_length])
        targets.append(source[start + 1: start + max_length + 1])

    return torch.stack(inputs), torch.stack(targets)

In [41]:
xb, yb = get_batch("train")

In [42]:
print(xb)

tensor([[61, 43,  1, 52, 43, 43, 42,  1],
        [53, 59, 56,  1, 51, 47, 57, 43],
        [52, 42,  5, 57, 58,  1, 47, 52],
        [ 0,  0, 22, 33, 24, 21, 17, 32]])


In [43]:
print(yb)

tensor([[43,  1, 52, 43, 43, 42,  1, 46],
        [59, 56,  1, 51, 47, 57, 43, 56],
        [42,  5, 57, 58,  1, 47, 52,  1],
        [ 0, 22, 33, 24, 21, 17, 32, 10]])


In [44]:
print(xb.shape)  ##A tensor of (4, 8), that is (batch_size, length of sequence(max_length))

torch.Size([4, 8])


#### Training a Bi-Gram Model without any History/Context Awareness
##### The Model, taking in the last word out of the whole sequence and predicting the next word.

#### Understanding Attention (A Mathematical Trick)

In [45]:
#### Let's define our B, T and C dimensions.
#### So in simpler terms,
## B - Batch size
## T - Timesteps/Time/Max Sequence Length
## C - Number of Channels / Embedding Dimension / Embedding Size

In [46]:
torch.manual_seed(1337)  ##Setting up a seed for Reproducibility

<torch._C.Generator at 0x1c0324091f0>

In [47]:
B, T, C = 4, 8, 2

In [48]:
x = torch.randn(B, T, C)   ##Sampling a Random tensor of the dimension (B, T, C), batch size = 4, time/max_sequence_length = 8, and number of channels/embedding_size/ embedding_dimensions = 2

In [49]:
print(x)

tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]],

        [[-0.6631, -0.2513],
         [ 1.0101,  0.1215],
         [ 0.1584,  1.1340],
         [-1.1539, -0.2984],
         [-0.5075, -0.9239],
         [ 0.5467, -1.4948],
         [-1.2057,  0.5718],
         [-0.5974, -0.6937]],

        [[ 1.6455, -0.8030],
         [ 1.3514, -0.2759],
         [-1.5108,  2.1048],
         [ 2.7630, -1.7465],
         [ 1.4516, -1.5103],
         [ 0.8212, -0.2115],
         [ 0.7789,  1.5333],
         [ 1.6097, -0.4032]]])


In [50]:
print(x.shape) ##C can also be taken as some information that we have at each point in the sequence.

torch.Size([4, 8, 2])


In [51]:
## So now we have these 8 tokens in a batch, and currently they are not talking to each other.
## We want them to talk to each other. (In other words, when you move from one word to the next word, the current word
## should contain some context or information from the words preceding it, or in general even from the words succeeding it.)

In [52]:
## In our specific case, we want each token to talk only to the tokens preceding it, not to the future tokens.

In [53]:
## For example, the token at the 6th location should not talk to the tokens at the 7th and 8th locations.
## It should only talk to the tokens at the 1st, 2nd, 3rd, 4th and 5th locations (and to itself).

In [54]:
## So information should only flow from the previous timesteps to the current timestep.
## We cannot use any information from the future, because the future is exactly what we are trying to predict (the next word).

In [55]:
## The easiest way for tokens to communicate is:
## Let's say I am token number 6 and I would like to communicate with my past. The simplest way is to
## just take an average of all the preceding elements (together with myself).

In [56]:
## So, if I am at the 6th token, I would like to take the channels that are information at my step, and 
## also the channels from the 5th, 4th, 3rd, 2nd and the 1st step. I like to average them up and 
## that sort of becomes a feature vector that summarized me in the context of my history.

In [57]:
## One thing to keep in mind, is that addition or average is an extremely weak form of interaction.
## Because we lose a ton of information like spatial arrangements of the tokens and all.

##### Let's calculate the average of all vectors in all the previous tokens and also at this t'th token for every single batch element independently for every t'th token in that sequence.

In [58]:
## Creating a bag of words, a term used for averaging of things.
## So there is a word stored on every one of these 8 locations and we are just averaging.

In [59]:
x_bow = torch.zeros((B, T, C))  ## Initialize it as zero

In [60]:
print(x_bow)

tensor([[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]])


In [61]:
## Now looping over each batch and time

In [62]:
for b in range(B):  ##Looping over each batch element
    # print(b)
    for t in range(T):  ##Looping over each token position (time step) in that batch element
        # print(t)
        x_prev = x[b, :t + 1]  ##Everything in this batch element up to and including the current t'th token
        print(x_prev.shape)  ##The shape is (t + 1, C): the tokens seen so far, each with its C channels
        print(x_prev)  ##The embeddings of those tokens, up to and including the t'th token
        x_bow[b, t] = torch.mean(x_prev, 0)  ##Averaging over dimension 0 (time) gives a single C-vector, which we store in x_bow

torch.Size([1, 2])
tensor([[ 0.1808, -0.0700]])
torch.Size([2, 2])
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152]])
torch.Size([3, 2])
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255]])
torch.Size([4, 2])
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643]])
torch.Size([5, 2])
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679]])
torch.Size([6, 2])
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102]])
torch.Size([7, 2])
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398]])
torch.Size([8, 2])
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
 

In [63]:
## Previously the dimension was (B, T, C), as we saw, the number of batches, then the max sequence length / time, and the embedding size.
## For Example:
## If I have the time or maximum sequence length as 8, then,
## First upto the first token,
## Second upto the second token (Second Included)
## Third upto the third token (Third included)

##Now we are trying to take an average of the embeddings/channels upto the t'th token including itself to establish
##the communication between them.

##So, for example:
##Let's say I am at 6th step, so I have the channels of tokens from 1st, 2nd upto 6th step.
## Like this: [[-2.0555,  1.8275],
##            [ 1.3035, -0.4501],
##            [ 1.3471,  1.6910],
##            [-0.1244, -1.6824],
##            [-0.0266,  0.0740],
##            [ 1.0517,  0.6779]]

##And then I will take average of all of these, to finally determine or define the token at 6th step.
##So now it has the information from all the tokens preceeding it with itself.

In [64]:
##Let's see the first batch of x and the x_bow

In [65]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [66]:
x_bow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [67]:
## As can be seen, the first row of x_bow equals the first row of x, because there we are averaging just that one token.
## After that, the second row is the average of the first two rows of x, the third is the average of the first 3 rows of x, and so on.

In [68]:
#### So this is the trick, this is all well and good but this is very inefficient.
#### So we can be efficient here by doing Matrix Multiplication.

#### Doing Matrix Multiplication and being Efficient

In [69]:
#### Matrix multiplication can be done independently for each batch element, that is batch-wise matrix multiplication, and it
#### gives exactly the same averages up to the t'th token as the loop above, only computed far more efficiently.

In [70]:
### Let's see an example first

In [71]:
torch.manual_seed(42)  ##Setting up the manual seed to reproduce the results

<torch._C.Generator at 0x1c0324091f0>

In [72]:
a = torch.ones(3, 3)  ##a matrix of all ones with dimensions (3, 3) (note: this is not an identity matrix)

In [73]:
a

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [74]:
b = torch.randint(0, 10, (3, 2)).float()  ##A random matrix of integers from 0 to 9 (the upper bound 10 is exclusive) with dimensions (3, 2), converted to float

In [75]:
b

tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])

In [76]:
#### If you look at this matrix b, you can see that it has the same layout as our sequence in the first batch, or in any other batch.
#### The only difference is that the sequence was (8, 2) and b is (3, 2).  **(3, 2) is used in this example so that it matches the all-ones matrix a, which is (3, 3).
#### It is only an example for now to gain an understanding.

In [77]:
c = a @ b  ##Doing the Matrix Multiplication

In [78]:
c

tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])

In [79]:
#### As you can see, it gives a matrix of shape (3, 2), the same dimensionality as matrix b.
#### The way the elements are obtained is: the first row of a, dot product with the first column of b, gives the first element of c.
#### Then the first row of a, dot product with the second column of b, gives the second element of c, and so on.

In [80]:
#### Now what the trick is:
#### Right now, a is just a boring matrix with one. But we can do something like:

In [81]:
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [82]:
#### It is a wrapper around the matrix and as you can see, it gives us the lower triangular matrix.
#### Now let's do this with a (manipulating a, to change the output.)

In [83]:
torch.manual_seed(42)
a = torch.ones(3, 3).tril()  ## lower-triangular matrix of ones: row i keeps columns 0..i
b = torch.randint(0, 10, (3, 2)).to(torch.float32)
c = torch.matmul(a, b)  ## row i of c is the sum of rows 0..i of b
print(a, b, c, sep="\n")

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [84]:
#### Now if you look at the output, the pattern is: depending on how many 1s and 0s we have in each row of a, we are summing a
#### variable number of rows of b. That is because a has ones in a lower triangular fashion and the other elements are zeroed out.
#### So, for example:
####([[ 2.,  7.],
#### [ 8., 11.],
#### [14., 16.]])

#### If you look here, the first row is the same as in matrix b (keep thinking of the matrix x[0] while understanding it: the first token in the sequence stays as it is.)
#### Then the second row is the sum of the first and second rows of b (the second token in the matrix x[0])
#### The third row is the sum of the first to third rows of b (the third token in the matrix x[0])

#### You can build the intuition that information is beginning to flow from the preceding tokens to the current / t'th token.
#### And it is all because how we manipulate the matrix a.

In [85]:
#### It is the summation right now because there are all ones in the matrix a.
#### We can also do the average of matrix b, in the incremental fashion.

In [86]:
#### We can normalize the rows in matrix a so these  sum to 1, then we can get an average.

In [87]:
torch.manual_seed(42)

<torch._C.Generator at 0x1c0324091f0>

In [88]:
a = torch.tril(torch.ones(3, 3))

In [89]:
a

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [90]:
torch.sum(a)

tensor(6.)

In [91]:
torch.sum(a, 1)  ## Summing along dimension 1 (across the columns), which gives one total per row.

tensor([1., 2., 3.])

In [92]:
torch.sum(a, 1, keepdim = True)  ##For broadcasting, we keep the dimension

tensor([[1.],
        [2.],
        [3.]])

In [93]:
a = a / torch.sum(a, 1, keepdim = True)

In [94]:
a  ##Now as you can see, matrix a has been normalized: the elements of each row sum to 1.

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [95]:
#### Now if we multiply this with matrix b, we get an average for elements in rows in matrix b in an incremental fashion.
#### So let's do this

In [96]:
torch.manual_seed(42)
a = torch.ones(3, 3).tril()  ## lower-triangular matrix of ones
a = a / a.sum(dim=1, keepdim=True)  ## divide every row by its own sum, so each row sums to 1
b = torch.randint(0, 10, (3, 2)).to(torch.float32)
c = torch.matmul(a, b)  ## row i of c is the average of rows 0..i of b
print(a, b, c, sep="\n")

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [97]:
#### As you can see, first row in c is the first row of b.
#### Second row in c, is the average of the first two rows of b.
#### Third row in c, is the average of all the rows of b.

#### Now from here, if you imagine the matrix x[0], you can build the intuition that the information in b flows from the preceding tokens in the sequence
#### to the current / t'th token.

#### This is all happening based on how we manipulate the elements in the multiplying matrix a and we can do these averages in this
#### incremental fashion and we can manipulate this based on the elements in matrix a.

In [98]:
#### Now let's apply this in the current batch of sequences.

In [99]:
#### So first, let's produce a matrix a, and I will call this matrix as 'wei'

In [100]:
wei = torch.tril(torch.ones(T, T))  ## The dimensions, as you can see is (T, T) that is (8, 8), because the dimension of the each sequence is (8, 2).
                                    ## So we need a matrix to store all the information from our sequence.
wei = wei / torch.sum(wei, 1, keepdim = True)

In [101]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [102]:
#### As you can see, now we have a matrix wei, similar to matrix a, where each row sums up to 1.
#### And this is, how much of every row we want to average up.

In [103]:
#### b is going to be x.

In [104]:
#### So now we have x_bow 2

In [105]:
## Lower-triangular averaging weights: row t spreads equal weight over positions 0..t
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
## wei is (T, T) and x is (B, T, C). The shapes differ, so PyTorch broadcasts wei across the
## batch dimension, as if it were (B, T, T), and performs a batched matrix multiplication.
## Each batch element is handled independently and in parallel: (T, T) multiplying (T, C),
## exactly as we saw while multiplying matrix a and matrix b.
## So in the end we get (B, T, C), because for each batch element we get (T, C).
x_bow2 = torch.matmul(wei, x)

In [106]:
torch.allclose(x_bow, x_bow2)  ##This is telling that x_bow and x_bow2 are identical.

True

In [107]:
## So if we see
print(x_bow[0])
print(x_bow2[0])

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [108]:
#### As you can see, the elements or the sequence in the first batch of x_bow and x_bow2 is identical.

In [109]:
#### So this is the mathematical trick. 
#### We were able to do this, batched matrix multiply to do this aggregation, it is a weighted aggregation.
#### And the weights are specified in that (T, T) array/Matrix, that is wei. 
#### And we are basically doing weighted sums and weights are in the wei matrix, that has taken on this triangular form (Lower Triangular).
#### And it is so that the token at the t'th position only gets information from the tokens preceding it (and from itself).
#### As we can see in the matrix/array x_bow2, the token at the 6th position only has information from the previous tokens and itself in matrix x.
#### So we only take the rows or tokens that precede the current / t'th token, together with the current one.
#### It is all possible by manipulating that wei matrix, making it lower triangular and normalizing it for averaging (without normalizing, we get only sums in the incremental fashion).
#### After normalizing the elements in row of wei, to be the sum equal to 1, we get the averages of the x in an incremental fashion.
#### And it happens batch-wise, pytorch automatically add the batch (B) dimension to the wei matrix that is (T, T) dimension.

#### Let's rewrite it in one more way, that is even more efficient and interesting

In [110]:
## So we have a matrix tril
tril = torch.tril(torch.ones(T, T))

In [111]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [112]:
## Now we have the wei matrix
wei = torch.zeros((T, T))

In [113]:
wei

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [114]:
#### wei begins as all zeroes, then we use masked fill,
#### So what am I saying is, every element in tril that is 0, make them negative infinity.

In [115]:
wei = wei.masked_fill(tril == 0, float('-inf'))  ## Wherever tril is 0, set the corresponding element of wei to negative infinity

In [116]:
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [117]:
#### So now if you see wei, you can see that all the elements in the upper triangular part of wei is '-inf' now.

In [118]:
#### Then we do softmax.
#### We take the softmax along every single row, so the dim is -1.

In [119]:
import torch.nn.functional as F

In [120]:
wei = F.softmax(wei, dim = -1)  ##Taking softmax along the rows

In [121]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [122]:
#### Softmax is also a normalization operation, so you get the exact same matrix as before.
#### In softmax, we exponentiate every single element in the row and then divide by the sum of the row.
#### So for example:
#### in the first row, when we exponentiate, we get 1 where there is 0 and 0 where there is '-inf'.
#### Then we divide by the sum, which is 1, so the first row becomes [1, 0, ..., 0], and so on for the other rows.

In [123]:
#### So this is also the same way to produce this mask, but what makes this more interesting is:
#### these weights in wei, begin with 0. So you can think of as an interaction strength or affinity so it is telling us that
#### how much of the token from the past do we want to aggregate/average out.
#### Then we create/apply a mask and say that tokens from the past can not communicate with future, so we set 0s in tril as '-inf'.
#### Then it goes through softmax and matrix multiplication.

In [124]:
#### So we initialize the wei matrix, that is, the weights matrix, as 0s, but these affinities between the tokens
#### are not always going to be zeros; they are going to be data dependent. The tokens are going to start looking at each
#### other, and some tokens will find other tokens interesting depending on what their values are. Then we say the future can not communicate
#### with the past, and then we normalize and sum: we aggregate their values depending on how interesting they find each other.
#### And this is the preview for self attention.

In [125]:
## Version 3 of the incremental average: mask + softmax. (For future intuition: later on these zero
## affinities are replaced by values computed from the data, so the weights will no longer be uniform.)
tril = torch.ones(T, T).tril()  ##Lower triangular matrix of ones: row t marks the positions up to and including t.
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))  ##All affinities start at 0; wherever tril is 0 (the future) they become -inf.
wei = torch.softmax(wei, dim=-1)  ##Row-wise softmax: -inf turns into 0 and every row sums to 1.
x_bow3 = torch.matmul(wei, x)  ##Matrix multiply with x gives the weighted average in the incremental fashion.

In [126]:
torch.allclose(x_bow2, x_bow3)

True

In [127]:
#### As you can see these are identical to each other. If I want to see further.
x_bow[0], x_bow2[0], x_bow3[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [128]:
#### As can be seen, these all are identical but the methods of arriving here are different.

### So long story short from this is:
1. We can do weighted aggregations of the past elements by using matrix multiplication with a lower triangular matrix.
2. Each of the elements in the lower triangle tells us how much of each past element fuses into this position.

### Self-Attention
#### Let's implement what we have learned so far and perform attention (Single Head Attention)

In [134]:
## So we have

In [135]:
torch.manual_seed(1337) ## Setting up a seed for reproducibility

<torch._C.Generator at 0x1c0324091f0>

In [136]:
##Setting up the dimensions
B = 4   ## batch size
T = 8   ## time steps / sequence length
C = 32  ## channels (embedding dimension): 32 instead of 2 now, so it looks a bit more reasonable.
        ## Remember, the channel vector of a token holds the information about that token.

In [137]:
x = torch.randn(B, T, C)  ##Creating a random tensor X for showcase.

In [138]:
x[0]

tensor([[ 1.8077e-01, -6.9988e-02, -3.5962e-01, -9.1520e-01,  6.2577e-01,
          2.5510e-02,  9.5451e-01,  6.4349e-02,  3.6115e-01,  1.1679e+00,
         -1.3499e+00, -5.1018e-01,  2.3596e-01, -2.3978e-01, -9.2111e-01,
          1.5433e+00,  1.3488e+00, -1.3964e-01,  2.8580e-01,  9.6512e-01,
         -2.0371e+00,  4.9314e-01,  1.4870e+00,  5.9103e-01,  1.2603e-01,
         -1.5627e+00, -1.1601e+00, -3.3484e-01,  4.4777e-01, -8.0164e-01,
          1.5236e+00,  2.5086e+00],
        [-6.6310e-01, -2.5128e-01,  1.0101e+00,  1.2155e-01,  1.5840e-01,
          1.1340e+00, -1.1539e+00, -2.9840e-01, -5.0754e-01, -9.2392e-01,
          5.4671e-01, -1.4948e+00, -1.2057e+00,  5.7182e-01, -5.9735e-01,
         -6.9368e-01,  1.6455e+00, -8.0299e-01,  1.3514e+00, -2.7592e-01,
         -1.5108e+00,  2.1048e+00,  2.7630e+00, -1.7465e+00,  1.4516e+00,
         -1.5103e+00,  8.2115e-01, -2.1153e-01,  7.7890e-01,  1.5333e+00,
          1.6097e+00, -4.0323e-01],
        [-8.3447e-01,  5.9780e-01, -5.14

In [139]:
x[0].shape

torch.Size([8, 32])

In [140]:
x.shape

torch.Size([4, 8, 32])

In [141]:
## Everything so far in one cell: a causal, uniform average over the tokens seen up to each position.
torch.manual_seed(1337)  ##Fixed seed so that x is reproducible.
B, T, C = 4, 8, 32  ##Batch, time (max sequence length), channels (embedding dimension).
x = torch.randn(B, T, C)  ##Random tensor of shape (B, T, C): 4 sequences, 8 positions each, 32 channels per token.
tril = torch.ones(T, T).tril()  ##Lower triangular matrix of ones, used as the causal mask.
wei = torch.zeros(T, T).masked_fill(tril == 0, float("-inf"))  ##Affinities all start at 0; positions after t are masked with -inf so token t never sees the future.
wei = F.softmax(wei, dim=-1)  ##Softmax along the rows: each row becomes uniform over positions 0..t and sums to 1.
out = torch.matmul(wei, x)  ##Batched matrix multiply: row t of out is the average of the tokens up to and including t.

In [143]:
out.shape  ##As can be seen the shape of the output is as the shape of X

torch.Size([4, 8, 32])

In [144]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [145]:
##This is the matrix in lower triangular fashion

In [146]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [147]:
##This is the matrix wei after masking and normalizing; as can be seen, all the rows are normalized so that the sum of all the elements
##in a row is equal to 1.

In [148]:
#### As we can see in the code above, it does a simple average of all the past tokens and a current token.
#### So it is the previous information and current information mixed together and then averaged.
#### This is achieved by the lower triangular matrix tril, that allows us to mask out this wei matrix.
#### So we masked it out and then we normalize it.

In [149]:
#### Currently we initialize the affinities between all the different tokens/nodes in the wei matrix to be zero.
#### We see that wei then has this structure where every single row has uniform numbers. That is what we send into the matrix
#### multiply, and it makes it so that we do a simple average.

In [150]:
#### Now, look carefully, that when we have initialized the affinity between all the tokens as 0s, the
#### wei matrix has the uniform numbers for each node/token. That means, when the affinities are 0, that is equal for
#### all the tokens, their impact on other tokens that is the amount they should communicate to other tokens is also uniform,
#### or in a sense equal. That is why when we do softmax to normalize the wei matrix, all tokens/nodes are treated in the same way and
#### we get a uniform matrix wei. So when we do the batch matrix multiply with x, we get a simple weighted average.

#### Keep in mind, this is because the weights matrix, wei is still initialized with 0s.

In [151]:
#### Now, as you can see that these numbers/affinities between tokens/nodes are uniform, initially zeros for each token.
#### Now we do not want it to be all uniform, because different tokens will find different other tokens more interesting, different tokens will
#### play different roles for other different tokens. How much a token want to communicate to other token will be different. So
#### we want them to be data dependent.

#### For example:
#### If I am a vowel and maybe I am looking for the consonants in my past and maybe I want to know what are those consonants and I 
#### want that information flow to me.

In [152]:
#### So I want that information in the past, but now I want to do it in a data dependent way. 
#### And this is the problem that self attention solves.

In [154]:
#### Self Attention solves it in a following way:
#### Now every single node / token at each position will emit two vectors, it will emit a 'Query' and it will emit a "Key".
#### The 'Query' vector roughly speaking is 'What am I looking for?' and 'Key' vector is roughly speaking 'What do I contain?'.
#### And the way we get affinities between these tokens now in a sequence is we do a 'Dot Product' between the keys and queries.

#### So my 'Query' dot products with all the 'Keys' of all the other tokens and that dot product now becomes wei.
#### To get the intuition: my 'What am I looking for?' (the 'Query') is dot-producted with the other tokens' 'What do I contain?' (their 'Keys').
#### Like this I find out which tokens' keys, that is, what they contain, are more interesting to me.
#### So that is how we will get affinities now.

#### And it happens for all the tokens: the query of the t'th token with the keys of all the preceding tokens (and its own).
#### So if a key and a query are aligned, their dot product is high, and I will get to learn more about that token as
#### opposed to any other token.

In [155]:
## So let's implement this now, we will implement the single head of self attention.

In [156]:
## There's a hyper paramter involved with self attention that is 'head size'.

In [170]:
head_size = 16  ##passing head size as 16

In [171]:
##Then we will initialize the linear modules from pytorch and keep bias as false.
##So these are just going to matrix multiply with some fixed weights.

In [172]:
##Two bias-free linear layers that project each token from C channels to head_size channels
##(32 -> 16 here). With bias = False each layer is just a matrix multiply by its weight matrix.
##key is constructed before query; keep that order, because nn.Linear initialises its weights randomly.
key = nn.Linear(in_features=C, out_features=head_size, bias=False)
query = nn.Linear(in_features=C, out_features=head_size, bias=False)

In [173]:
key, query

(Linear(in_features=32, out_features=16, bias=False),
 Linear(in_features=32, out_features=16, bias=False))

In [174]:
##As it can be seen, the input features are 32, that is the channels/embedding dimension of x and output features are
##16, that is the new paramter called head size. There will be multiple heads in future but now we are just implementing for one
##head only. We use this linear layer, that will do the normal matrix multiplication with fixed weights and project the final dimension
##size, from C to head_size, that is 32 to 16.

In [175]:
##I will forward these modules on x, and produce the k and q.

In [176]:
##Forward both layers on x: every token gets its own key and its own query.
k, q = key(x), query(x)

In [177]:
##The size becomes (B, T, 16) for both k and q, because we forward the linear modules on x with the head size as the
##output features.

#### For future reference, the head size is usually the embedding dimension divided by the number of heads. In the paper (Attention Is All You Need), an embedding dimension of 512 with 8 heads gives a head size of 64.
#### So the head size and the number of heads are tied together.
#### For example:
#### If I have the embedding size / embedding dimension / channels as 512 and I use 4 heads, I divide 512 by 4.
#### That is, I get the head_size as 128. Then I concatenate the outputs of all the heads together and get the channels / embedding dimension (4 * 128 = 512) back as the final dimension.
#### All the different heads will have gathered up some information, and we concatenate that.

In [178]:
q[0], k[0]  ##These are the queries and keys of each node/ token in the first sequence in the batch.

(tensor([[-0.8604, -0.6466,  0.0754, -0.6363, -0.1464,  0.3840, -0.3053, -0.6389,
           0.0544,  0.0762,  0.2364, -0.1792, -0.2908,  0.3348, -0.9431, -0.1245],
         [-0.8963, -0.1909,  0.0988, -1.0337, -0.4137,  0.9316, -0.1586, -0.1791,
           0.1190,  0.6819,  0.0265,  0.5562, -0.1615,  0.1101,  0.0846, -0.6664],
         [-0.7785,  0.2959, -0.2016, -0.7096,  0.4515,  0.4565,  0.2554,  0.4989,
           0.0964,  0.5159,  0.3097, -0.3569, -0.2772, -0.3573,  0.4011,  0.0285],
         [ 0.6401, -0.1027,  0.4378,  0.6083,  0.5365,  0.3228,  0.0042,  0.6595,
           0.4381, -0.3499, -1.4774, -0.4494,  0.6942, -0.0524,  0.1576,  0.2287],
         [ 0.3436,  0.7927,  1.3172,  0.2551, -0.1853,  0.3615,  1.2092,  0.6290,
           0.1703,  0.3958,  0.2562, -0.1899, -0.8195,  0.0613,  0.2854, -1.2319],
         [ 1.6714, -0.7109,  1.5069,  2.0694,  0.4068,  0.2825,  0.9223,  1.3963,
          -1.0078,  0.0742,  0.2363,  0.8289,  0.1465, -1.5054,  0.6776,  0.4237],
         [

In [180]:
q.shape, k.shape

(torch.Size([4, 8, 16]), torch.Size([4, 8, 16]))

In [181]:
##Naive attempt: multiply q and k directly to get the wei matrix.
##Both q and k are (B, T, head_size) here, so the inner dimensions do not line up and the batched
##matrix multiply raises a RuntimeError. The error is the point of this cell, so it is caught and
##printed instead of aborting a Restart & Run All.
try:
    out = q @ k
except RuntimeError as shape_error:
    print("RuntimeError:", shape_error)

RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [4, 16] but got: [4, 8].

In [182]:
## As can be seen, we get an error, so we have to transpose k: swap its last two dimensions so that
## it becomes (B, head_size, T) and lines up with q, which is (B, T, head_size).
## k is recomputed from key(x) instead of being reassigned from itself (k = k.transpose(-2, -1)):
## a self-reassignment flips k back to (B, T, head_size) if this cell is ever run a second time.
k = key(x).transpose(-2, -1)

In [184]:
k.shape  ##As you can see it has been transposed

torch.Size([4, 16, 8])

In [185]:
##Now if I do the matrix multiplication
out = q @ k

In [187]:
##As you can see the matrix multiplication is done
out.shape

torch.Size([4, 8, 8])

In [188]:
##As you can see the shape of the out is (B, T, T) now. And it will work as the wei matrix now.
##Previously, every sequence in the batch shared the same wei matrix with uniform numbers for the matrix multiplication.
##But now we have calculated the queries and keys for all the tokens in all the sequences of the batch
##and done a batch-wise matrix multiplication. So each of the 4 sequences in the batch gets its own wei matrix with different values,
##because each contains different tokens, and so the affinities are different too: the wei matrix is data dependent now.

In [189]:
out

tensor([[[ 0.4384, -0.2834, -0.9627, -2.0258, -0.7739, -0.4457,  0.1768,
          -1.2515],
         [ 0.9968, -1.8596, -2.7785, -1.7456, -1.2913,  1.2673, -0.6790,
          -0.4780],
         [ 1.6637, -1.2497, -1.1042, -1.0542,  0.2847,  0.3279,  0.3317,
           0.4054],
         [-1.3585, -0.2854,  0.3539,  2.2091,  1.6465, -0.4384, -0.7792,
           0.6153],
         [ 0.5707, -3.0360, -2.4672, -1.6114,  3.1261, -1.0325, -1.9564,
          -0.6380],
         [-1.1955,  1.7788,  2.4887,  1.4483,  3.0263, -0.4825, -0.7313,
           2.6313],
         [ 1.2233,  0.8108,  1.1598,  0.6504,  0.4180,  2.0683,  0.4607,
          -1.4127],
         [ 2.0534, -1.6042, -1.3519, -2.1525,  2.2785, -0.3494, -0.9830,
          -2.0904]],

        [[ 1.1554, -0.0065,  2.0088,  0.1566,  0.3512,  2.2127,  1.9287,
           0.8258],
         [-2.7658,  1.2351, -1.6067, -0.9266, -0.3247, -0.1245, -1.9458,
          -0.1695],
         [ 0.7916,  0.5579,  1.1484,  0.0371,  1.3073,  1.5446,  0.3

In [190]:
##As can be seen above, different affinities for different nodes/tokens in all the batches.

In [191]:
##Now I have x that is (B, T, C) and wei that is (B, T, T), with varying numbers instead of all zeros and uniform numbers because
##it is data dependent now.

In [192]:
##Now let's mask this
tril = torch.tril(torch.ones(T, T))

In [194]:
wei = out.masked_fill(tril == 0, float('-inf'))

In [195]:
##Now we have our masked wei matrix, that contains the weights or affinities of all the tokens for the t'th token and the tokens
##preceeding it. 
##In short, how much of the information each token want to communicate to the t'th token.

In [196]:
##So if we take a look
wei[0]

tensor([[ 0.4384,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 0.9968, -1.8596,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 1.6637, -1.2497, -1.1042,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-1.3585, -0.2854,  0.3539,  2.2091,    -inf,    -inf,    -inf,    -inf],
        [ 0.5707, -3.0360, -2.4672, -1.6114,  3.1261,    -inf,    -inf,    -inf],
        [-1.1955,  1.7788,  2.4887,  1.4483,  3.0263, -0.4825,    -inf,    -inf],
        [ 1.2233,  0.8108,  1.1598,  0.6504,  0.4180,  2.0683,  0.4607,    -inf],
        [ 2.0534, -1.6042, -1.3519, -2.1525,  2.2785, -0.3494, -0.9830, -2.0904]],
       grad_fn=<SelectBackward0>)

In [197]:
##As you can see, these are the raw affinities: how much the tokens want to talk to each other.

In [198]:
##Now if I normalize this using softmax along the rows, we won't be getting the uniform matrix, but we will get as to how much
##the tokens/nodes at each position want to communicate.
wei = F.softmax(wei, dim = -1)

In [199]:
##Now if we take a look at wei
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9456, 0.0544, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8952, 0.0486, 0.0562, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0223, 0.0651, 0.1234, 0.7892, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0711, 0.0019, 0.0034, 0.0080, 0.9155, 0.0000, 0.0000, 0.0000],
        [0.0069, 0.1353, 0.2752, 0.0972, 0.4712, 0.0141, 0.0000, 0.0000],
        [0.1561, 0.1033, 0.1465, 0.0880, 0.0698, 0.3634, 0.0728, 0.0000],
        [0.4031, 0.0104, 0.0134, 0.0060, 0.5049, 0.0365, 0.0194, 0.0064]],
       grad_fn=<SelectBackward0>)

In [200]:
##As you can see, this is telling us the amount of how much each token want to communicate to the t'th token in the first sequence
##in the batch.

In [201]:
##Now if I multiply this with x
out = wei @ x
out.shape

torch.Size([4, 8, 32])

In [203]:
##As you can see, we have the channel, dimension back as the final dimension.
out[0] ##This is the final output after matrix multiplication, the aggregated average, but it is data dependent now.

tensor([[ 1.8077e-01, -6.9988e-02, -3.5962e-01, -9.1520e-01,  6.2577e-01,
          2.5510e-02,  9.5451e-01,  6.4349e-02,  3.6115e-01,  1.1679e+00,
         -1.3499e+00, -5.1018e-01,  2.3596e-01, -2.3978e-01, -9.2111e-01,
          1.5433e+00,  1.3488e+00, -1.3964e-01,  2.8580e-01,  9.6512e-01,
         -2.0371e+00,  4.9314e-01,  1.4870e+00,  5.9103e-01,  1.2603e-01,
         -1.5627e+00, -1.1601e+00, -3.3484e-01,  4.4777e-01, -8.0164e-01,
          1.5236e+00,  2.5086e+00],
        [ 1.3491e-01, -7.9841e-02, -2.8517e-01, -8.5886e-01,  6.0036e-01,
          8.5760e-02,  8.3992e-01,  4.4633e-02,  3.1394e-01,  1.0542e+00,
         -1.2468e+00, -5.6369e-01,  1.5760e-01, -1.9567e-01, -9.0352e-01,
          1.4217e+00,  1.3650e+00, -1.7570e-01,  3.4372e-01,  8.9767e-01,
         -2.0085e+00,  5.8073e-01,  1.5563e+00,  4.6398e-01,  1.9808e-01,
         -1.5598e+00, -1.0524e+00, -3.2814e-01,  4.6577e-01, -6.7474e-01,
          1.5283e+00,  2.3503e+00],
        [ 8.2692e-02, -4.1261e-02, -2.75

In [204]:
## Single attention head, first version: data-dependent weights, still aggregating the raw tokens x.
torch.manual_seed(1337)
B, T, C = 4, 8, 32  ##Batch, time (sequence length), channels (embedding dimension)
x = torch.randn(B, T, C)

head_size = 16  ##Output width of the key and query projections
##query is constructed before key; that order is kept because nn.Linear initialises its weights randomly.
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)

##Forwarding key and query on x gives every token its own key and query, independently for each sequence in the batch.
q, k = query(x), key(x)

##No communication has happened up to here; it happens in this product of q and k.
##(B, T, 16) @ (B, 16, T) gives (B, T, T): for every sequence in the batch a T x T matrix of affinities between its tokens.
wei = torch.matmul(q, k.transpose(-2, -1))
tril = torch.ones(T, T).tril()
wei = F.softmax(wei.masked_fill(tril == 0, float("-inf")), dim=-1)  ##Mask the future, then normalize each row.
out = torch.matmul(wei, x)  ##The weighted aggregation now depends on the data through the keys and queries of the tokens.

In [206]:
out.shape

torch.Size([4, 8, 32])

In [207]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5877, 0.4123, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4457, 0.2810, 0.2733, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2220, 0.7496, 0.0175, 0.0109, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0379, 0.0124, 0.0412, 0.0630, 0.8454, 0.0000, 0.0000, 0.0000],
         [0.5497, 0.2187, 0.0185, 0.0239, 0.1831, 0.0062, 0.0000, 0.0000],
         [0.2576, 0.0830, 0.0946, 0.0241, 0.1273, 0.3627, 0.0507, 0.0000],
         [0.0499, 0.1052, 0.0302, 0.0281, 0.1980, 0.2657, 0.1755, 0.1474]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4289, 0.5711, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5413, 0.1423, 0.3165, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0635, 0.8138, 0.0557, 0.0669, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4958, 0.0758, 0.2224, 0.0156, 0.1905, 0.0000, 0.0000, 0.0000],
         [0.3957, 0.112

In [208]:
#### Every batch will have different wei, because every batch contains different tokens at different positions.
#### So it is now data dependent.

In [209]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5877, 0.4123, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4457, 0.2810, 0.2733, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2220, 0.7496, 0.0175, 0.0109, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0379, 0.0124, 0.0412, 0.0630, 0.8454, 0.0000, 0.0000, 0.0000],
        [0.5497, 0.2187, 0.0185, 0.0239, 0.1831, 0.0062, 0.0000, 0.0000],
        [0.2576, 0.0830, 0.0946, 0.0241, 0.1273, 0.3627, 0.0507, 0.0000],
        [0.0499, 0.1052, 0.0302, 0.0281, 0.1980, 0.2657, 0.1755, 0.1474]],
       grad_fn=<SelectBackward0>)

In [210]:
#### You can see it is not uniform now.

In [211]:
#### There is one more part to a single self attention head.
#### When we do the aggregation, we do not aggregate the tokens x directly. We produce one more vector per token
#### and we call it the 'Value'.
#### We produce the value in the same way that we produced the key and the query.
#### And then we aggregate that value instead of the raw tokens x.

In [236]:
## Single attention head, second version: aggregate a projected 'value' instead of the raw tokens x.
torch.manual_seed(1337)  ##Seed for reproducibility
B, T, C = 4, 8, 32  ##Batch, time (sequence length), channels (embedding dimension)
x = torch.randn(B, T, C)  ##Random tensor x of dimension (B, T, C)

head_size = 16  ##Output width of each projection
##Three bias-free projections, constructed in the order key, query, value.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k, q, v = key(x), query(x), value(x)  ##Keys, queries and values; each is (B, T, head_size)

wei = torch.matmul(q, k.transpose(-2, -1))  ##Query-key dot products: the affinities between tokens, (B, T, T)
tril = torch.ones(T, T).tril()  ##Lower triangular matrix used for masking
wei = wei.masked_fill(tril == 0, float('-inf'))  ##Mask out the future positions

wei = F.softmax(wei, dim=-1)  ##Normalize each row with softmax

out = torch.matmul(wei, v)  ##Aggregate the values v instead of the tokens x: (B, T, T) @ (B, T, head_size)

out.shape

torch.Size([4, 8, 16])

In [215]:
#### You can think of x as kind of like private information to this token, so x is private to this token.
#### So I am a 5th token and I have some identity and my information is kept in vector x.
#### Now for the purposes of this single head, here is 'What I am interested in?', here is 'What do I contain?' and
#### if you find me interesting, here is 'what I will communicate to you?' and this is stored in 'Value', v.
#### So v is the thing that is aggregated for the purposes of this single head between the different nodes.

In [216]:
####And basically this is the Attention Mechanism.

### There are few notes about Attention Mechanism:
1. Attention is a communication mechanism. You can think about it as a communication mechanism, where you have a number of nodes in a directed graph where you have edges pointing between those, and what happens is every node has some vector of information, and it has to aggregate that information via a weighted sum for all the nodes that point to it. And this is done in a data dependent manner. We have 8 nodes/tokens because we have max sequence length as 8. And the first node is only pointed to itself, the second node is pointed to by the first node and itself, all the way upto the 8th node, which is pointed to by all the previous nodes and itself. This is the order that we have in our graph, basically in any auto-regressing language modeling. But in general, Attention can be applied to any arbitrary directed graph. It is just a communication mechanism between the nodes.
2. There is no notion of space, so attention simply acts over a set of vectors in the graph. By default, these nodes have no idea where they are positioned in space. That's why we need to encode them positionally and give them some information that is anchored to a specific position, so they know where they are.
3. Elements across the batch dimensions never talk to each other. They are always processed independently. This is batch matrix multiply, that applies basically a matrix multiplication in parallel across the batch dimensions.
4. The last note is that, according to the paper, there is something called scaling, and the result is called scaled attention: the query-key dot products are multiplied by 1/sqrt(head_size). They do this to normalize the scores, and it is an important normalization to have.

In [218]:
##A look at the scaling factor used in scaled attention.
##Draw random keys and queries and take their dot products without any scaling.
k, q = (torch.randn(B, T, head_size) for _ in range(2))  ##k is drawn first, then q
wei = torch.matmul(q, k.transpose(-2, -1))

In [219]:
k.var()

tensor(1.0700)

In [220]:
q.var()

tensor(0.9006)

In [222]:
wei.var()

tensor(18.0429)

In [223]:
#### Suppose you have unit gaussian inputs, that is, 0 mean and unit variance.
#### k and q are unit gaussian here, and if you compute wei naively, its variance is on the order of
#### head_size, which is 16 (each entry of wei is a sum of head_size products; we measured about 18 above).
#### But if you multiply by 1/sqrt(head_size), then the variance of wei is about 1, so the unit variance is preserved.

In [228]:
##Same experiment, but the dot products are multiplied by 1/sqrt(head_size).
k, q = (torch.randn(B, T, head_size) for _ in range(2))  ##k is drawn first, then q
wei = torch.matmul(q, k.transpose(-2, -1)) * head_size ** -0.5

In [229]:
k.var()

tensor(0.9416)

In [230]:
q.var()

tensor(1.0104)

In [231]:
wei.var()

tensor(1.0879)

In [232]:
#### Now, why is this important? We know that wei is fed into softmax, so it is really important, especially at initialization, that
#### wei be fairly diffuse.

In [237]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [238]:
#### The wei shown here comes from the unscaled attention head above, and some of its rows are already quite peaked (for example 0.84 or 0.79 on a single token).
#### The problem is that, because of softmax, if wei takes on very positive and very negative numbers, softmax will actually
#### converge to one-hot vectors.

In [239]:
##To illustrate that
##Let's say we are applying softmax to a tensor of values that are very close to 0 then we are gonna get the diffused thing out of
##softmax

In [240]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim = -1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [241]:
##As you can see above, diffused values we got after doing the softmax

In [243]:
##But the moment I take these exact same values and start sharpening it, making it bigger by multiplying by 8 for example
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim = -1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [244]:
## You will see that softmax starts to sharpen towards the max, that is, towards whichever number here is the
## highest. So we do not want these values to be too extreme, especially at initialization, otherwise softmax will be too peaky.
## Then every node would basically aggregate information from just a single other node, and
## this is not what we want, especially at initialization; we want information to be aggregated from all the preceding nodes.

In [245]:
##Scaled self-attention: the single head from before, plus the 1/sqrt(head_size) scaling factor.
torch.manual_seed(1337)  ##Seed for reproducibility
B, T, C = 4, 8, 32  ##Batch, time (sequence length), channels (embedding dimension)
x = torch.randn(B, T, C)  ##Random tensor x of dimension (B, T, C)

head_size = 16  ##Output width of each projection
##Three bias-free projections, constructed in the order key, query, value.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k, q, v = key(x), query(x), value(x)  ##Keys, queries and values; each is (B, T, head_size)

##Scaled affinities, (B, T, T). Scaling before the mask gives the same numbers as scaling after it,
##because the masked entries are -inf either way.
wei = torch.matmul(q, k.transpose(-2, -1)) * head_size**-0.5
tril = torch.ones(T, T).tril()  ##Lower triangular matrix used for masking
wei = wei.masked_fill(tril == 0, float('-inf'))  ##Mask out the future positions

wei = F.softmax(wei, dim=-1)  ##Normalize each row; thanks to the scaling the rows are less peaky

out = torch.matmul(wei, v)  ##Aggregate the values v instead of the tokens x

out.shape

torch.Size([4, 8, 16])

In [246]:
##Attention weights of the first batch element: each row sums to 1 and the entries above the diagonal (future positions) are 0
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
        [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
        [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
        [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],
       grad_fn=<SelectBackward0>)

In [247]:
##Now if we look at our wei, it is good to go.
##A little hint: the key, query and value projections (which produce the k, q and v vectors) are learned while training the transformer, so this information gets better over time.

#### This is all about self attention. Let's take this knowledge of self-attention for a spin.

In [249]:
##x is still the (B, T, C) = (4, 8, 32) tensor created in the self-attention cell
x.shape

torch.Size([4, 8, 32])

In [250]:
B, T, C = x.size()  ##Read the batch, time and channel sizes back from x
scores = x.view(-1, C)  ##Merge the batch and time dims into one: (B, T, C) -> (B*T, C)

In [251]:
##The flattened view has B*T = 32 rows and C = 32 columns
scores.shape

torch.Size([32, 32])

In [254]:
##Keep only the last column of the flattened (B*T, C) view -> shape (B*T,).
##The value is derived from x directly (rather than scores = scores[:, -1]) so that this cell is idempotent:
##re-running the self-referential form on an already 1-D scores raises an IndexError.
scores = x.view(B * T, C)[:, -1]

In [255]:
##Only one value per row is left, so scores is now 1-D with B*T = 32 entries
scores.shape

torch.Size([32])

In [256]:
##Keep only the last time step of every sequence: (B, T, C) -> (B, C)
##NOTE(review): this rebinds x, so the cell is not idempotent - running it again on the now 2-D x raises an IndexError.
x = x[:, -1, :]

In [257]:
##After dropping the time dim, x is (B, C) = (4, 32)
x.shape

torch.Size([4, 32])

In [260]:
##x now holds one row of C = 32 values (the last time step) per batch element
x

tensor([[-5.8188e-01, -2.2083e-01,  1.3537e-02, -3.0574e-01, -3.0384e-02,
          8.2161e-01,  3.8670e-04, -4.4742e-01,  8.2040e-01, -1.5178e+00,
          6.1587e-01, -1.8648e+00, -9.7773e-01,  6.3224e-02, -4.5483e-01,
         -4.1474e-01,  1.4987e+00, -3.9867e-02, -8.0510e-01, -1.1624e+00,
          4.2716e-01, -2.8192e-01, -1.2773e-02, -8.7792e-01, -3.2248e-01,
          1.8299e-01, -9.3030e-01, -1.2488e+00,  1.1192e+00, -1.9079e+00,
         -5.2756e-01,  1.0807e+00],
        [ 6.4547e-01, -3.3132e-01, -1.0390e+00,  9.1116e-01,  1.2984e+00,
          5.5509e-01, -4.6531e-01, -5.5186e-01,  1.1925e+00, -6.6420e-01,
         -9.1163e-03, -1.1712e+00,  4.8306e-01,  3.5048e-01, -5.7443e-01,
          1.2531e+00, -6.7409e-01,  3.9710e-01,  1.9287e-01, -2.1749e+00,
          1.6730e+00, -4.2359e-02, -1.1758e-01,  1.0546e+00, -1.5693e-02,
          2.4782e-01,  5.0760e-01, -9.0286e-01,  1.7872e+00,  8.9457e-02,
         -3.7475e-01, -4.7815e-01],
        [ 7.2012e-02,  1.1080e+00,  7.29

In [258]:
probs = x.softmax(dim=-1)  ##Turn every row of x into a probability distribution over its 32 entries

In [259]:
##One probability distribution over the 32 entries per batch row
probs

tensor([[0.0162, 0.0233, 0.0294, 0.0214, 0.0281, 0.0660, 0.0290, 0.0185, 0.0659,
         0.0064, 0.0537, 0.0045, 0.0109, 0.0309, 0.0184, 0.0192, 0.1299, 0.0279,
         0.0130, 0.0091, 0.0445, 0.0219, 0.0286, 0.0121, 0.0210, 0.0348, 0.0114,
         0.0083, 0.0888, 0.0043, 0.0171, 0.0855],
        [0.0379, 0.0143, 0.0070, 0.0494, 0.0727, 0.0346, 0.0125, 0.0114, 0.0654,
         0.0102, 0.0197, 0.0062, 0.0322, 0.0282, 0.0112, 0.0695, 0.0101, 0.0295,
         0.0241, 0.0023, 0.1058, 0.0190, 0.0177, 0.0570, 0.0195, 0.0254, 0.0330,
         0.0080, 0.1186, 0.0217, 0.0136, 0.0123],
        [0.0285, 0.0802, 0.0549, 0.0382, 0.0105, 0.0077, 0.0295, 0.0281, 0.0452,
         0.0857, 0.0288, 0.0268, 0.0172, 0.0202, 0.0020, 0.0222, 0.0100, 0.0300,
         0.1363, 0.0149, 0.0061, 0.0204, 0.0155, 0.0121, 0.0269, 0.0422, 0.0488,
         0.0055, 0.0121, 0.0394, 0.0100, 0.0442],
        [0.0477, 0.0257, 0.0350, 0.0415, 0.0215, 0.1225, 0.1050, 0.0038, 0.0266,
         0.0024, 0.0344, 0.0310, 0.0047,

In [262]:
next_x = probs.multinomial(num_samples=1)  ##Sample one index per row according to that row's probabilities -> (B, 1)

In [264]:
##The sampled indices, one per batch row, as a (4, 1) integer tensor
next_x

tensor([[13],
        [ 7],
        [15],
        [16]])

In [265]:
##Append the sampled index to every row as one extra column: (B, C) + (B, 1) -> (B, C + 1).
##next_x holds integer indices while x is a float tensor, so cast explicitly instead of relying on implicit
##type promotion inside torch.cat; the indices end up stored as floats (e.g. 13 -> 1.3000e+01).
x_new = torch.cat((x, next_x.to(x.dtype)), dim=1)

In [266]:
##One extra column was appended: (4, 32) -> (4, 33)
x_new.shape

torch.Size([4, 33])

In [267]:
##The last entry of every row is the sampled index, shown as a float (e.g. 1.3000e+01 for index 13)
x_new

tensor([[-5.8188e-01, -2.2083e-01,  1.3537e-02, -3.0574e-01, -3.0384e-02,
          8.2161e-01,  3.8670e-04, -4.4742e-01,  8.2040e-01, -1.5178e+00,
          6.1587e-01, -1.8648e+00, -9.7773e-01,  6.3224e-02, -4.5483e-01,
         -4.1474e-01,  1.4987e+00, -3.9867e-02, -8.0510e-01, -1.1624e+00,
          4.2716e-01, -2.8192e-01, -1.2773e-02, -8.7792e-01, -3.2248e-01,
          1.8299e-01, -9.3030e-01, -1.2488e+00,  1.1192e+00, -1.9079e+00,
         -5.2756e-01,  1.0807e+00,  1.3000e+01],
        [ 6.4547e-01, -3.3132e-01, -1.0390e+00,  9.1116e-01,  1.2984e+00,
          5.5509e-01, -4.6531e-01, -5.5186e-01,  1.1925e+00, -6.6420e-01,
         -9.1163e-03, -1.1712e+00,  4.8306e-01,  3.5048e-01, -5.7443e-01,
          1.2531e+00, -6.7409e-01,  3.9710e-01,  1.9287e-01, -2.1749e+00,
          1.6730e+00, -4.2359e-02, -1.1758e-01,  1.0546e+00, -1.5693e-02,
          2.4782e-01,  5.0760e-01, -9.0286e-01,  1.7872e+00,  8.9457e-02,
         -3.7475e-01, -4.7815e-01,  7.0000e+00],
        [ 7.20

In [269]:
x_new2 = torch.arange(0, 8)  ##Integer tensor holding 0, 1, ..., 7

In [270]:
##The integers 0 through 7
x_new2

tensor([0, 1, 2, 3, 4, 5, 6, 7])