In [1]:
import sys

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

  cpu = _conversion_method_template(device=torch.device("cpu"))


### Loading the Dataset

In [3]:
with open("input.txt", "r") as input_file:
    text = input_file.read()

In [4]:
##Creating the vocabulary
chars = sorted(list(set(text))) ##All the unique chars in our dataset
vocab_size = len(chars)
print("Vocabulary", "".join(chars))
print("Length of our vocabulary", vocab_size)  ##These are basically all the elements from our dataset which the model will see/generate or emit.

Vocabulary 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length of our vocabulary 65


### Creating the Simple Tokenizer

In [5]:
##Creating encoder and decoder
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda string: [stoi[c] for c in string]
decode = lambda l: "".join([itos[c] for c in l])

In [6]:
##Loading and converting the data into a sequence of integers
import torch
data = torch.tensor(encode(text), dtype = torch.long)  ##Converting into the pytroch tensor

In [7]:
print(data[:1000])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

### Splitting the Data

In [8]:
n = int(0.9 * len(data))
print(n)
train_data = data[:n]
val_data = data[n:]

1003854


### Setting up Hyperparameters

In [9]:
batch_size = 32
max_seq_length = 8
max_iters = 5000
vocab_size = 65
n_embd = 32
head_size = 16
learning_rate = 1e-3
device = "cuda" if torch.cuda.is_available() else 'cpu'

### Function to get the batch

In [10]:
##Function to get a batch
def get_batch(split):
    data = train_data if split == 'train' else val_data
    index = torch.randint(len(data) - max_seq_length, (batch_size,))
    xb = torch.stack([data[i: i + max_seq_length] for i in index])
    yb = torch.stack([data[i + 1: i + max_seq_length + 1] for i in index])
    return xb, yb

### Creating a simple model architecture

In [11]:
class Transformer(nn.Module):

    ##Creating the constructor
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.lm_head = nn.Linear(embeddings, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
    
    ##Creating the forward method
    def forward(self, x, target = None):
        ##Getting the dimension of the x
        B, T, C = x.shape
        ##Getting the mebeddings of the tokens by enocding the identities of the tokens that are in x now, after we convert our data in the sequence of integers.
        embeddings = self.embedding_table(x)  ##x will be (B, T, n_embd)
        ##One more thing that people do is adding a positional embedding as well, to get the current position of the index/token in x.
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) ##torch.arange are all the integers form 0 to T-1, they get embedded through the table to make (T, n_embd)    
        ##Concatenating the embeddings and positional embeddings
        x = embeddings + positional_embeddings  ##Torch come in picture and adds the batch dimension to positional embeddings and addition is done.
        ##Creating one linear layer for indirection, x contains both the feature info and the postional information
        logits = self.lm_head(x)  ##logits are (B, T, vocab_size)

        ##Calculating the loss
        if targets is None:
            loss = None
        else:
            B, T, C = embeddings.shape
            embeddings = embeddings.view(B*T, C)  ##Changing in the two dimensional vector, that is (B*T, C)
            targets = targets.view(B*T)  ##Targets are only going to be single dimensional so (B*T) that is target for each token in the sequence in the batch.
            loss = F.cross_entropy(embeddings, targets)
            
        return embeddings, loss

    ##Method to generate the response
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            ##We get the scores/outputs by running the model on the inputs
            scores, loss = self(x)
            ##We only want to focus on the last time step to generate the next word
            scores = scores[:, -1, :] ##it becomes (B, C)
            ##Getting the probabilities using softmax
            probs = F.softmax(scores, dim = -1)
            ##Sampling out from the distribution
            x_next = torch.multinomial(probs, num_samples = 1) ##It becomes (B, 1)
            ##Concatenating the sampled token to the running sequence
            x = torch.cat((x, x_next), dim = 1)  ##It becomes (B, T+1)
            
        return x

### Implementing the single head of Self Attention

In [12]:
class SingleHead(nn.Module):

    ##Creating the constructor
    def __init__(self, head_size): ##We will need the head_size
        super().__init__()
        ##Head size
        self.head_size = head_size
        ##Creating the Linear layers for key, query and value, so these are the linear projections that we will apply to all of our nodes
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        ##Creating the buffer for tril, that is the lower triangular matrix
        self.register_buffer('tril', torch.tril(torch.ones(max_seq_length, max_seq_length)))
        
    def forward(self, x):
        print(x.shape)
        ##Getting the shapes
        B, T, C = x.shape
        ##Calculating the keys, queries and values vectors for all the nodes
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        ##Calculating the wei matrix that will contain weights and applying scaling factor
        wei = q @ k.transpose(-2, -1) * C**-0.5 ##We get (B, T, T)
        ##Applying mask to prevent communication with the future tokens and also for weighted aggregation
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) ##Still (B, T, T)
        ##Applying softmax along the rows to normalize it so it sums up to 1 and we have affinity scores
        wei = F.softmax(wei, dim = -1)
        ##Aggregating v by doing matrix multiplication with the matrix wei
        out = wei @ v

        return out


### Creating the head in the Language Model

In [13]:
class Transformer(nn.Module):

    ##Creating the constructor
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
        self.sa_head = SingleHead(n_embd)  ##Implementing the single head of self attention
        self.lm_head = nn.Linear(n_embd, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)
    ##Creating the forward method
    def forward(self, x, targets = None):
        ##Getting the dimension of the x
        B, T = x.shape
        ##Getting the mebeddings of the tokens by enocding the identities of the tokens that are in x now, after we convert our data in the sequence of integers.
        embeddings = self.embedding_table(x)  ##x will be (B, T, n_embd)
        ##One more thing that people do is adding a positional embedding as well, to get the current position of the index/token in x.
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) ##torch.arange are all the integers form 0 to T-1, they get embedded through the table to make (T, n_embd)    
        ##Concatenating the embeddings and positional embeddings
        final_embeddings = embeddings + positional_embeddings  ##Torch come in picture and adds the batch dimension to positional embeddings and addition is done.
        ##Feeding to the self attention head
        x = self.sa_head(final_embeddings)
        ##Creating one linear layer for indirection, x contains both the feature info and the postional information, we can call it simple decoder language modeling head
        logits = self.lm_head(x)  ##logits are (B, T, vocab_size)

        ##Calculating the loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  ##Changing in the two dimensional vector, that is (B*T, C)
            targets = targets.view(B*T)  ##Targets are only going to be single dimensional so (B*T) that is target for each token in the sequence in the batch.
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    ##Method to generate the response
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            ##Cropping the x, because now we are using positional embeddings, we can never have more than the max_seq_length coming in because if I do then our postional embedding look uptable will run out of scope because it only has embeddings up to max_seq_length
            idx_crop = x[:, -max_seq_length:]
            ##We get the scores/outputs by running the model on the inputs
            scores, loss = self(idx_crop)
            ##We only want to focus on the last time step to generate the next word
            scores = scores[:, -1, :] ##it becomes (B, C)
            ##Getting the probabilities using softmax
            probs = F.softmax(scores, dim = -1)
            ##Sampling out from the distribution
            x_next = torch.multinomial(probs, num_samples = 1) ##It becomes (B, 1)
            ##Concatenating the sampled token to the running sequence
            x = torch.cat((x, x_next), dim = 1)  ##It becomes (B, T+1)
            
        return x

In [14]:
m = Transformer().to(device)  ##Initializing the model

In [15]:
m  ##Architecture of the Transformer

Transformer(
  (embedding_table): Embedding(65, 32)
  (positional_embedding_table): Embedding(8, 32)
  (sa_head): SingleHead(
    (key): Linear(in_features=32, out_features=32, bias=False)
    (query): Linear(in_features=32, out_features=32, bias=False)
    (value): Linear(in_features=32, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
)

In [16]:
##Intitalizing an optimizer

In [17]:
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [164]:
for steps in range(max_iters):
    ##Sampling out a batch from the training set
    xb, yb = get_batch('train')
    
    ##Evaluating the loss
    scores, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    ##Running backward propagation
    loss.backward()
    ##Updating the weights
    optimizer.step()
    print(f"On step: {steps}, the loss is: ", loss.item())

torch.Size([32, 8, 32])
On step: 0, the loss is:  4.239473342895508
torch.Size([32, 8, 32])
On step: 1, the loss is:  4.229648590087891
torch.Size([32, 8, 32])
On step: 2, the loss is:  4.206923007965088
torch.Size([32, 8, 32])
On step: 3, the loss is:  4.171305179595947
torch.Size([32, 8, 32])
On step: 4, the loss is:  4.156835079193115
torch.Size([32, 8, 32])
On step: 5, the loss is:  4.154654026031494
torch.Size([32, 8, 32])
On step: 6, the loss is:  4.126086711883545
torch.Size([32, 8, 32])
On step: 7, the loss is:  4.109684944152832
torch.Size([32, 8, 32])
On step: 8, the loss is:  4.069436073303223
torch.Size([32, 8, 32])
On step: 9, the loss is:  4.113122940063477
torch.Size([32, 8, 32])
On step: 10, the loss is:  4.028069496154785
torch.Size([32, 8, 32])
On step: 11, the loss is:  4.081264972686768
torch.Size([32, 8, 32])
On step: 12, the loss is:  4.0127153396606445
torch.Size([32, 8, 32])
On step: 13, the loss is:  4.075645446777344
torch.Size([32, 8, 32])
On step: 14, the lo

In [73]:
loss.item()

2.318453550338745

In [74]:
##Decoding using our model
print(decode(m.generate(x = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))


Be
Ango fshecSeve;
A hel ithis ha: bere but soly mo tich thardesald thay ilvere om
Tounge homt btnory tosan ithey an besr my tidyo pil, omicor an ofr dut hon ondovexdy
I iso flofrhe piro yay mbie? te hepavetlyoul tel dim st oung:acofr thir mer bllloupr ut andn mulifanul thous thofr ould surt
DO is wheerr: ome foul silso cot yofrer fre go: wer, oqury ady out bura--h-.
Nouthe dthadum my lover fthis
Theorou isenand. ewe med towribanoud the bovimera pharmth, hmy ang!
Tilll, In therense ingter ofet hom se?

?'
I ven'swes be. he IPry GLalegghhofl Ither, nto; hes osk
lull tot Low then toedomy nvee lchet hothepothy arg ff is.

No lf artoom tak gal'sin her ispuine t
Nous akis hilerayard ld arcour wa let gay ancpro quies the't mburh Elaprat Rtangoso hay hecherre co od worino amul,
Forf fe:
 noun tr-thouls ssus heavereco hasse eat thea,
And, I k'tis, pieler, st mailejeing
'Kes rwe tht averauine brf fatne myoun the, pern son hasto whigse ite adger se'tu hat histre.

Acterf bourdak's ar'k hith eal

In [75]:
##Saving the model
torch.save(m, 'self-attn-5000_stps.pth')

In [77]:
##Loading the model
model = torch.load('self-attn-5000_stps.pth')

  model = torch.load('self-attn-5000_stps.pth')


### Multi-Head Attention
- Applying Multiple Attentions in parallel and concatenating the results.
- Tokens/Nodes in a sequence may have a lot of things to communicate between each other and single head attention may not be enough, so multiple headed attention is used to capture complex meanings and patterns between the tokens or nodes.

In [18]:
## Let's implement multi-headed attention

In [19]:
class MultiHeadAttention(nn.Module):

    ##Creating the constructor, we want mutiple heads of attention running in parallel, we can do this in pytorch like this
    def __init__(self, num_heads, head_size):
        super().__init__()
        ##Creating multiple heads, we run all the heads in parallel into a list
        self.heads = nn.ModuleList([SingleHead(head_size) for _ in range(num_heads)])

    ##Creating the forward method
    def forward(self, x):
        return torch.cat([h(x) for h in self.heads], dim = -1)  ##We concatenate all of the outputs and we do it along the channels/embeddings dimension, that is -1.
    

In [20]:
#### Now We do not have a single attention so let's make changes in our Transformer Model 
#### Now we do not have single attention that has a head size of 32, instead of havnig one communication channel, we now have 
#### 4 communication channels running in parallel.

#### And each one of these communication channels will be typically smaller correspondingly.
#### Because we have 4 communication channels, we want 8 dimensional self attention.
#### And so from each communication channel, we are getting together 8 dimensional vectors and we have 4 of them then
#### that concatenates together to give us n_embd that is our original embedding size that is 32.

#### One thing to keep in mind, we are implementing 4 heads of self attention so we divide n_embd by 4 to get the head_size.
#### Similarly, if we want 8 heads, we would multiply n_embd or the embedding_dimension by 8 to get the head_size that will be 4 in this case.
#### And it will be so on. So that when we concatenate them together along the head_size/channels dimension we get back our n_embd or 
#### original embeddings dimension.
#### n_embd is the embeddings dimension, it is not 32 right now but we will increase it later on.

In [21]:
class Transformer(nn.Module):

    ##Creating the constructor
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
        self.sa_heads = MultiHeadAttention(4, n_embd//4)  ##Implementing 4 heads for the multi head of self attention that will run in parallel.
        self.lm_head = nn.Linear(n_embd, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)
    ##Creating the forward method
    def forward(self, x, targets = None):
        ##Getting the dimension of the x
        B, T = x.shape
        ##Getting the mebeddings of the tokens by enocding the identities of the tokens that are in x now, after we convert our data in the sequence of integers.
        embeddings = self.embedding_table(x)  ##x will be (B, T, n_embd)
        ##One more thing that people do is adding a positional embedding as well, to get the current position of the index/token in x.
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) ##torch.arange are all the integers form 0 to T-1, they get embedded through the table to make (T, n_embd)    
        ##Concatenating the embeddings and positional embeddings
        final_embeddings = embeddings + positional_embeddings  ##Torch come in picture and adds the batch dimension to positional embeddings and addition is done.
        ##Feeding to the self attention head
        x = self.sa_heads(final_embeddings)  ##Applying multi-head attention
        ##Creating one linear layer for indirection, x contains both the feature info and the postional information, we can call it simple decoder language modeling head
        logits = self.lm_head(x)  ##logits are (B, T, vocab_size)

        ##Calculating the loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  ##Changing in the two dimensional vector, that is (B*T, C)
            targets = targets.view(B*T)  ##Targets are only going to be single dimensional so (B*T) that is target for each token in the sequence in the batch.
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    ##Method to generate the response
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            ##Cropping the x, because now we are using positional embeddings, we can never have more than the max_seq_length coming in because if I do then our postional embedding look uptable will run out of scope because it only has embeddings up to max_seq_length
            idx_crop = x[:, -max_seq_length:]
            ##We get the scores/outputs by running the model on the inputs
            scores, loss = self(idx_crop)
            ##We only want to focus on the last time step to generate the next word
            scores = scores[:, -1, :] ##it becomes (B, C)
            ##Getting the probabilities using softmax
            probs = F.softmax(scores, dim = -1)
            ##Sampling out from the distribution
            x_next = torch.multinomial(probs, num_samples = 1) ##It becomes (B, 1)
            ##Concatenating the sampled token to the running sequence
            x = torch.cat((x, x_next), dim = 1)  ##It becomes (B, T+1)
            
        return x

In [22]:
m = Transformer().to(device)  ##Initializing the model

In [23]:
m

Transformer(
  (embedding_table): Embedding(65, 32)
  (positional_embedding_table): Embedding(8, 32)
  (sa_heads): MultiHeadAttention(
    (heads): ModuleList(
      (0-3): 4 x SingleHead(
        (key): Linear(in_features=32, out_features=8, bias=False)
        (query): Linear(in_features=32, out_features=8, bias=False)
        (value): Linear(in_features=32, out_features=8, bias=False)
      )
    )
  )
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
)

In [24]:
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [175]:
for steps in range(max_iters):
    ##Sampling out a batch from the training set
    xb, yb = get_batch('train')
    
    ##Evaluating the loss
    scores, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    ##Running backward propagation
    loss.backward()
    ##Updating the weights
    optimizer.step()
    print(f"On step: {steps}, the loss is: ", loss.item())

torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
On step: 0, the loss is:  4.274183750152588
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
On step: 1, the loss is:  4.257466793060303
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
On step: 2, the loss is:  4.236926555633545
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
On step: 3, the loss is:  4.227233409881592
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
On step: 4, the loss is:  4.257778644561768
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
On step: 5, the loss is:  4.214492321014404
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
torch.Size([32, 8, 32])
On step: 6, the loss is:  4.165422439575195
torch.Size([32, 8, 3

KeyboardInterrupt: 

In [94]:
loss.item()

2.1372950077056885

In [95]:
print(decode(m.generate(x = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))


Bere gone wrarereen the ge this ut, hady urif trives ales
Cortesod four wor thout ge frald thum dilll I hou met and swor in agun, amis.

AUKENTI to papy are tha ha fand so thallafrin ham wit, I His: asm lide lors: lided, latpustly, kro!
Whis mwoulde wis sy por wit I whemof shond be hand
MLBEWAUT:
A the gromy his goll I the by dish now spe ich doighto bre-'fof Elodrs,
And dand de bot se
Thet, angesurde ho, my is de sun,
Gich mustu vis sound thest frece wrotls ut tee me mim hey;
Thaldasn own.

SWANGNINDUHEMESWuld tha whif the lord
iold bear re Rund the hay I hand bre! ace thind to rucse yourego,
MOn you, no yat ponoblond ealian, Hou to he wort, sou, 'and folt ace and my thale you morend aneme dun
AThes, lond.

JAUd not the Vut thas maund'llacours make re thape wther mandd sebloplecesigh to pon Im day:
Wal erand,
Is do mus tet for thightut thel, in so she sond plidy thight my mute hawards thant; lave ba,
Thath ay ma rom wont sow But mat you cow wand fat aufl I to-UCHENENEN:
Colagalt, son

In [96]:
##Saving the model
torch.save(m, "multi-head-4hds-self-attn.pth")

### Let's add the Feed Forward layer
- Adding Computation into the Network.
- This computation is on a per node level.

In [25]:
##Let's implement a simple feed forward layer

In [26]:
#### Before we had the multiple heads in our model but we were calculating the logits too quickly without actually doing
#### any computation. So the tokens looked at each other but did not really have a time to think on what they found from the other
#### tokens.

In [27]:
#### In Deef Forward, when it is applying Linear, this is on a per token level, all tokens do it independently.
#### So Self Attention is a communication, and once they have gathered all the data, now they need to think on that data
#### individually. That is why we added a feed forward layer right after the attention mechanism.

In [28]:
class FeedForward(nn.Module):
    ##Creating the constructor
    def __init__(self, n_embd):
        super().__init__()
        ##Creating the sequential layer with non-linearity
        self.ffn = nn.Sequential(
            nn.Linear(n_embd, n_embd),
            nn.ReLU()
        )

    ##Creating the forward method
    def forward(self, x):
        ##Forwarding the Network on x
        return self.ffn(x)

In [29]:
class Transformer(nn.Module):

    ##Creating the constructor
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
        self.sa_heads = MultiHeadAttention(4, n_embd//4)  ##Implementing 4 heads for the multi head of self attention that will run in parallel.
        self.ffn = FeedForward(n_embd)  ##Implementing the Feed Foward Layer
        self.lm_head = nn.Linear(n_embd, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)
    ##Creating the forward method
    def forward(self, x, targets = None):
        ##Getting the dimension of the x
        B, T = x.shape
        ##Getting the mebeddings of the tokens by enocding the identities of the tokens that are in x now, after we convert our data in the sequence of integers.
        embeddings = self.embedding_table(x)  ##x will be (B, T, n_embd)
        ##One more thing that people do is adding a positional embedding as well, to get the current position of the index/token in x.
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) ##torch.arange are all the integers form 0 to T-1, they get embedded through the table to make (T, n_embd)    
        ##Concatenating the embeddings and positional embeddings
        final_embeddings = embeddings + positional_embeddings  ##Torch come in picture and adds the batch dimension to positional embeddings and addition is done.
        ##Feeding to the self attention head
        x = self.sa_heads(final_embeddings)  ##Applying multi-head attention
        ##Applying Feed Foward layer right after the multi-headed attention so tokens can think on what they found
        x = self.ffn(x)
        ##Creating one linear layer for indirection, x contains both the feature info and the postional information, we can call it simple decoder language modeling head
        logits = self.lm_head(x)  ##logits are (B, T, vocab_size)

        ##Calculating the loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  ##Changing in the two dimensional vector, that is (B*T, C)
            targets = targets.view(B*T)  ##Targets are only going to be single dimensional so (B*T) that is target for each token in the sequence in the batch.
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    ##Method to generate the response
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            ##Cropping the x, because now we are using positional embeddings, we can never have more than the max_seq_length coming in because if I do then our postional embedding look uptable will run out of scope because it only has embeddings up to max_seq_length
            idx_crop = x[:, -max_seq_length:]
            ##We get the scores/outputs by running the model on the inputs
            scores, loss = self(idx_crop)
            ##We only want to focus on the last time step to generate the next word
            scores = scores[:, -1, :] ##it becomes (B, C)
            ##Getting the probabilities using softmax
            probs = F.softmax(scores, dim = -1)
            ##Sampling out from the distribution
            x_next = torch.multinomial(probs, num_samples = 1) ##It becomes (B, 1)
            ##Concatenating the sampled token to the running sequence
            x = torch.cat((x, x_next), dim = 1)  ##It becomes (B, T+1)
            
        return x

In [30]:
m = Transformer().to(device)

In [31]:
m

Transformer(
  (embedding_table): Embedding(65, 32)
  (positional_embedding_table): Embedding(8, 32)
  (sa_heads): MultiHeadAttention(
    (heads): ModuleList(
      (0-3): 4 x SingleHead(
        (key): Linear(in_features=32, out_features=8, bias=False)
        (query): Linear(in_features=32, out_features=8, bias=False)
        (value): Linear(in_features=32, out_features=8, bias=False)
      )
    )
  )
  (ffn): FeedForward(
    (ffn): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): ReLU()
    )
  )
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
)

In [32]:
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [78]:
for steps in range(max_iters):
    ##Sampling out a batch from the training set
    xb, yb = get_batch('train')
    
    ##Evaluating the loss
    scores, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    ##Running backward propagation
    loss.backward()
    ##Updating the weights
    optimizer.step()
    print(f"On step: {steps}, the loss is: ", loss.item())

On step: 0, the loss is:  4.172155857086182
On step: 1, the loss is:  4.142795562744141
On step: 2, the loss is:  4.14162540435791
On step: 3, the loss is:  4.136049747467041
On step: 4, the loss is:  4.138381481170654
On step: 5, the loss is:  4.150176525115967
On step: 6, the loss is:  4.123851299285889
On step: 7, the loss is:  4.120871067047119
On step: 8, the loss is:  4.106868743896484
On step: 9, the loss is:  4.099303722381592
On step: 10, the loss is:  4.064326763153076
On step: 11, the loss is:  4.067160129547119
On step: 12, the loss is:  4.061840057373047
On step: 13, the loss is:  4.042304992675781
On step: 14, the loss is:  4.036181926727295
On step: 15, the loss is:  4.026295185089111
On step: 16, the loss is:  4.010344505310059
On step: 17, the loss is:  3.9778048992156982
On step: 18, the loss is:  4.005540370941162
On step: 19, the loss is:  3.946467638015747
On step: 20, the loss is:  3.9423656463623047
On step: 21, the loss is:  3.9028966426849365
On step: 22, the l

In [79]:
loss.item()

2.2478256225585938

In [80]:
print(decode(m.generate(x = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))



Widl I thim Hellity, non sot wif tis thein firt?

Hith mam youncede drin, to, exre hall and
Teercle!
Hane, yar pour Wheed,
As to feark oown hatt bather mums to iffores, mut gorin beds sex nome hon,
ARWErif hat werience sore
Dor the won man dowthil,
Yo ght dee non tis he theis offes we me and lith and pensget lito howse.

CAURD: ivit ofcader wee me theme mur lifall in deadith inhe he
His IF ofdr bein.

ONTo lobs he; daper he
Hour?

Thorth dersirl wrlay, will I ce swake dronood I toingo!
I I Beray--
MVAUS:
Gen:
Duts now my to jI heis to shat he wen hir thour ofer'y'tt'd wit, tres.

AMENREOR VAS a An andelf!
Plousser therios:

Sile ill.

GRIWARAURY ICETANUSEg : wotclir Son woll.

MBENTEG:

Chat nothon me shoter thou nour thle arsut I anly roe wour, trer Pwen poulidemave rive thios tresunsess to your le:
Ton delf the oft Lre ryerncan:
Hit.
Teloour all.
CARUMENENCHOMo, I hatlly as freebrived lot nomntall ir swne, I sund CEbour amedest loot marth thouch and wer, eacke? Gearceem onee
Wearti

In [117]:
##Saving the model
torch.save(m, "multi-head-4ds-ffd-self-attn-5000-steps.pth")

### Adding Residual Connection and creating multiple blocks of attention and feed forward

In [33]:
#### We are going to intersperse the communication with the computation.
#### It is also what transformer does when it has blocks that ommunicate and then compute.
#### Then it groups them and replicates them.

In [34]:
#### We will create a block, a block basically intersperses the communcation and the computation.
#### The communication is done with attention and computation with the feed forward network on all the nodes independently.

In [35]:
#### We will also calculate the head_size based on numberof heads that we will pass as the argument while
#### creating the block and the n_embd.
#### We do this by dividing the n_embd with the number of heads and get the head_size.
#### We do it because after multi-headed attention when we concatenate the heads together and we get the n_embd back.

In [36]:
## Creating the block

In [37]:
class Block(nn.Module):
    ##Creating the Constructor

    def __init__(self, n_embd, n_heads):
        super().__init__()
        ##Calculating the head_size
        head_size = n_embd // n_heads
        ##Creating the multi-headed attention and the feed forward
        self.sa_heads = MultiHeadAttention(n_embd, head_size)
        self.ffn = FeedForward(n_embd)

    ##Creating the forwarrd method to forward on x
    def forward(self, x):
        ##Forwarding the Multi-Headed Attention
        x = self.sa_heads(x)
        ##Forwarding x on the Feed Forward layer
        x = self.ffn(x)
        return x

In [38]:
## Adding the blocks in our Trnasformers Network

In [39]:
class Transformer(nn.Module):

    ##Creating the constructor
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
        self.blocks = nn.Sequential(    ##Creating the multiple blocks of Multi-Headed Attention and Feed Forward Sequentially.
            Block(n_embd, n_heads = 4),
            Block(n_embd, n_heads = 4),
            Block(n_embd, n_heads = 4),
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)
    ##Creating the forward method
    def forward(self, x, targets = None):
        ##Getting the dimension of the x
        B, T = x.shape
        ##Getting the mebeddings of the tokens by enocding the identities of the tokens that are in x now, after we convert our data in the sequence of integers.
        embeddings = self.embedding_table(x)  ##x will be (B, T, n_embd)
        ##One more thing that people do is adding a positional embedding as well, to get the current position of the index/token in x.
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) ##torch.arange are all the integers form 0 to T-1, they get embedded through the table to make (T, n_embd)    
        ##Concatenating the embeddings and positional embeddings
        final_embeddings = embeddings + positional_embeddings  ##Torch come in picture and adds the batch dimension to positional embeddings and addition is done.
        ##Feeding to the Multiple Blocks that are implemented sequentially.
        x = self.blocks(final_embeddings)  ##Adding multiple blocks of attention and feed forward.
        ##Creating one linear layer for indirection, x contains both the feature info and the postional information, we can call it simple decoder language modeling head
        logits = self.lm_head(x)  ##logits are (B, T, vocab_size)

        ##Calculating the loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  ##Changing in the two dimensional vector, that is (B*T, C)
            targets = targets.view(B*T)  ##Targets are only going to be single dimensional so (B*T) that is target for each token in the sequence in the batch.
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    ##Method to generate the response
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            ##Cropping the x, because now we are using positional embeddings, we can never have more than the max_seq_length coming in because if I do then our postional embedding look uptable will run out of scope because it only has embeddings up to max_seq_length
            idx_crop = x[:, -max_seq_length:]
            ##We get the scores/outputs by running the model on the inputs
            scores, loss = self(idx_crop)
            ##We only want to focus on the last time step to generate the next word
            scores = scores[:, -1, :] ##it becomes (B, C)
            ##Getting the probabilities using softmax
            probs = F.softmax(scores, dim = -1)
            ##Sampling out from the distribution
            x_next = torch.multinomial(probs, num_samples = 1) ##It becomes (B, 1)
            ##Concatenating the sampled token to the running sequence
            x = torch.cat((x, x_next), dim = 1)  ##It becomes (B, T+1)
            
        return x

In [40]:
m = Transformer().to(device)

In [41]:
m

Transformer(
  (embedding_table): Embedding(65, 32)
  (positional_embedding_table): Embedding(8, 32)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-31): 32 x SingleHead(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
          )
        )
      )
      (ffn): FeedForward(
        (ffn): Sequential(
          (0): Linear(in_features=32, out_features=32, bias=True)
          (1): ReLU()
        )
      )
    )
    (1): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-31): 32 x SingleHead(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
          )
    

In [42]:
optimizer = torch.optim.AdamW(m.parameters(), lr = learning_rate)

In [43]:
#### When we train it, it won't really give any good results, the reason is we are starting to get a pretty
#### deep neural network, and deep neural networks suffer from the optimization issues. 
#### So let's borrow one more idea from the tranformers paper to resolve those difficulties, which is adding residual connections.

In [147]:
class SingleHead(nn.Module):

    ##Creating the constructor
    def __init__(self, head_size): ##We will need the head_size
        super().__init__()
        ##Head size
        self.head_size = head_size
        ##Creating the Linear layers for key, query and value, so these are the linear projections that we will apply to all of our nodes
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        ##Creating the buffer for tril, that is the lower triangular matrix
        self.register_buffer('tril', torch.tril(torch.ones(max_seq_length, max_seq_length)))
        
    def forward(self, x):
        # print(x.shape)
        ##Getting the shapes
        B, T, C = x.shape
        ##Calculating the keys, queries and values vectors for all the nodes
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        ##Calculating the wei matrix that will contain weights and applying scaling factor
        wei = q @ k.transpose(-2, -1) * C**-0.5 ##We get (B, T, T)
        ##Applying mask to prevent communication with the future tokens and also for weighted aggregation
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) ##Still (B, T, T)
        ##Applying softmax along the rows to normalize it so it sums up to 1 and we have affinity scores
        wei = F.softmax(wei, dim = -1)
        ##Aggregating v by doing matrix multiplication with the matrix wei
        out = wei @ v

        return out

In [148]:
#### There are two optimizations that significantly help with the depth of the these networks.
#### First one being, residual or skip connections and second is the Normalization.

In [149]:
## Now in our Multi head attention, we also have to introduce a projection (Read After the Block Module)
class MultiHeadAttention(nn.Module):

    ##Creating the constructor, we want mutiple heads of attention running in parallel, we can do this in pytorch like this
    def __init__(self, num_heads, head_size):
        super().__init__()
        ##Creating multiple heads, we run all the heads in parallel into a list
        self.heads = nn.ModuleList([SingleHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)

    ##Creating the forward method
    def forward(self, x):
        out =  torch.cat([h(x) for h in self.heads], dim = -1)  ##We concatenate all of the outputs and we do it along the channels/embeddings dimension, that is -1.
        out = self.proj(out)
        return out

In [150]:
## In Feed Forward Network, it is going to be the same thing, let's add the Projection (Read after the addition of Projection layer in the Multi Head Attention Block)
class FeedForward(nn.Module):
    ##Creating the constructor
    def __init__(self, n_embd):
        super().__init__()
        ##Creating the sequential layer with non-linearity
        self.ffn = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),  ##There was one more change according the paper, the inner layer of the Feed Forward Network shoukd be multiplied by 4, in terms of Channel / Embeddings Dimension. 
                                            ##So ,adding a bit of computaiton here, and growing that layer that is in the residual block on the side of the residual pathway.
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),  ##So this is the Projection layer, going back in the Residual Pathway.
        )

    ##Creating the forward method
    def forward(self, x):
        ##Forwarding the Network on x
        return self.ffn(x)

In [151]:
## So let's implement residual connections
## Gradient goes from Supervision to the Input, and the block (computation block) over time kicks in.
## So what we wanna do is:

In [152]:
class Block(nn.Module):
    ##Creating the Constructor

    def __init__(self, n_embd, n_heads):
        super().__init__()
        ##Calculating the head_size
        head_size = n_embd // n_heads
        ##Creating the multi-headed attention and the feed forward
        self.sa_heads = MultiHeadAttention(n_heads, head_size)
        self.ffn = FeedForward(n_embd)

    ##Creating the forwarrd method to forward on x
    def forward(self, x):
        ##Forwarding the Multi-Headed Attention, adding residual highways, or skip connections
        x = x + self.sa_heads(x)  ##So we have x, and then we fork off and do some communication and come back
        ##Forwarding x on the Feed Forward layer, adding residual highways, or skip connections
        x = x + self.ffn(x)  ##So we have x, then we fork off and do some computation and come back
        return x

In [153]:
class Transformer(nn.Module):

    ##Creating the constructor
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
        self.blocks = nn.Sequential(    ##Creating the multiple blocks of Multi-Headed Attention and Feed Forward Sequentially.
            Block(n_embd, n_heads = 4),
            Block(n_embd, n_heads = 4),
            Block(n_embd, n_heads = 4)
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)
    ##Creating the forward method
    def forward(self, x, targets = None):
        ##Getting the dimension of the x
        B, T = x.shape
        ##Getting the mebeddings of the tokens by enocding the identities of the tokens that are in x now, after we convert our data in the sequence of integers.
        embeddings = self.embedding_table(x)  ##x will be (B, T, n_embd)
        ##One more thing that people do is adding a positional embedding as well, to get the current position of the index/token in x.
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) ##torch.arange are all the integers form 0 to T-1, they get embedded through the table to make (T, n_embd)    
        ##Concatenating the embeddings and positional embeddings
        final_embeddings = embeddings + positional_embeddings  ##Torch come in picture and adds the batch dimension to positional embeddings and addition is done.
        ##Feeding to the Multiple Blocks that are implemented sequentially.
        x = self.blocks(final_embeddings)  ##Adding multiple blocks of attention and feed forward.
        ##Creating one linear layer for indirection, x contains both the feature info and the postional information, we can call it simple decoder language modeling head
        logits = self.lm_head(x)  ##logits are (B, T, vocab_size)

        ##Calculating the loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  ##Changing in the two dimensional vector, that is (B*T, C)
            targets = targets.view(B*T)  ##Targets are only going to be single dimensional so (B*T) that is target for each token in the sequence in the batch.
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    ##Method to generate the response
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            ##Cropping the x, because now we are using positional embeddings, we can never have more than the max_seq_length coming in because if I do then our postional embedding look uptable will run out of scope because it only has embeddings up to max_seq_length
            idx_crop = x[:, -max_seq_length:]
            ##We get the scores/outputs by running the model on the inputs
            scores, loss = self(idx_crop)
            ##We only want to focus on the last time step to generate the next word
            scores = scores[:, -1, :] ##it becomes (B, C)
            ##Getting the probabilities using softmax
            probs = F.softmax(scores, dim = -1)
            ##Sampling out from the distribution
            x_next = torch.multinomial(probs, num_samples = 1) ##It becomes (B, 1)
            ##Concatenating the sampled token to the running sequence
            x = torch.cat((x, x_next), dim = 1)  ##It becomes (B, T+1)
            
        return x

In [154]:
m = Transformer().to(device)

In [155]:
m

Transformer(
  (embedding_table): Embedding(65, 32)
  (positional_embedding_table): Embedding(256, 32)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x SingleHead(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
          )
        )
        (proj): Linear(in_features=32, out_features=32, bias=True)
      )
      (ffn): FeedForward(
        (ffn): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=32, bias=True)
        )
      )
    )
    (1): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x SingleHead(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear

In [156]:
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [253]:
for steps in range(max_iters):
    ##Sampling out a batch from the training set
    xb, yb = get_batch('train')
    
    ##Evaluating the loss
    scores, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    ##Running backward propagation
    loss.backward()
    ##Updating the weights
    optimizer.step()
    print(f"On step: {steps}, the loss is: ", loss.item())

On step: 0, the loss is:  4.679819107055664
On step: 1, the loss is:  4.48321008682251
On step: 2, the loss is:  4.413363456726074
On step: 3, the loss is:  4.013591766357422
On step: 4, the loss is:  4.103349208831787
On step: 5, the loss is:  3.9582791328430176
On step: 6, the loss is:  3.92871356010437
On step: 7, the loss is:  3.960771322250366
On step: 8, the loss is:  3.76145339012146
On step: 9, the loss is:  3.630363941192627
On step: 10, the loss is:  3.7150826454162598
On step: 11, the loss is:  3.629748821258545
On step: 12, the loss is:  3.5141353607177734
On step: 13, the loss is:  3.5823779106140137
On step: 14, the loss is:  3.5229105949401855
On step: 15, the loss is:  3.5245330333709717
On step: 16, the loss is:  3.4431354999542236
On step: 17, the loss is:  3.340395450592041
On step: 18, the loss is:  3.4751358032226562
On step: 19, the loss is:  3.164785146713257
On step: 20, the loss is:  3.272310256958008
On step: 21, the loss is:  3.477116107940674
On step: 22, th

KeyboardInterrupt: 

### Applying Layer Norm

In [290]:
## Second innovation that is very helpful in optimizing deep neural networks is layer Normalization.

In [291]:
#### Layer Norm is very very similar to batch norm. batch Normalization basically just makes sure that across the batch
#### dimension, every individual neuron had unit gaussian distribution, so 0 mean and 1 standard deviation output.
#### So batch norm is normalizing every single column of the input. Now, rows are not going to be normalized by default.
#### So let's now implement layer norm. So now, we don't normalize the columns, we normalize the rows.

In [292]:
#### So, now for every individual example, let's say it is 32 dimensional vector, is normalized in this way.

In [293]:
#### In the transformers paper, actually, a few details have been changes from the transformers paper, and this is something
#### that actuallyslightly departs from the original paper. You see that the "Add and Norm", is applied after the transformation.
#### But now, it is basically common to apply the layer norm before the transformations, so there is a reshuffling of the layer
#### norms. SO this this called the "Pre=Norm Formulation", and this is the one we are going to use as well.

In [294]:
## So. let's now implement layer norm in out example.

In [295]:
## So, we apply it in the block, before the Self-Attention Mechanism and the Feed Forward Network.

In [296]:
class Block(nn.Module):
    ##Creating the Constructor
    def __init__(self, n_embd, n_heads):
        super().__init__()
        ##Calculating the head_size
        head_size = n_embd // n_heads
        ##Creating the multi-headed attention and the feed forward
        self.sa_heads = MultiHeadAttention(n_heads, head_size)
        self.ffn = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)  ##Intializing the layer norm 1, it needs the embedding dimension or the channels as the input
        self.ln2 = nn.LayerNorm(n_embd)  ##Initializing the layer norm 2

    ##Creating the forwarrd method to forward on x
    def forward(self, x):
        ## Here, the layer norms are applied immediately on x, before it goes into Self-Attention and Feed Forward Network
        ##Forwarding the Multi-Headed Attention, adding residual highways, or skip connections
        x = x + self.sa_heads(self.ln1(x))  ##So we have x, and then we fork off and do some communication and come back, also applied the layer norm immediately on x.
        ##Forwarding x on the Feed Forward layer, adding residual highways, or skip connections
        x = x + self.ffn(self.ln2(x))  ##So we have x, then we fork off and do some computation and come back, also applied the layer norm on x first.
        return x

In [297]:
#### The size of the Layer Norm above is n_embd, that is 32. So, when the Layer Norm is normalizing the features, the
#### normalization happens, the mean and variance are taking over 32 numbers. So, Batch and the time, the max sequence length,
#### act as Batch Dimensions.
#### So this is like a, per token transformation that normalizes the features(channels) and makes them unit gaussian at initialization.

In [298]:
#### But, because these layer norms inside themselves has the gamma and beta parameters, that are trainable.
#### The layer norm will eventually create outputs that might not be unit gaussian, but the Optimization will determine that.

In [299]:
#### There should be a layer norm typically, at the end of the transformer, right before the final linear layer,
#### that is lm_head (the language modeling head).

In [300]:
## So let's build our Transformer

In [301]:
class Transformer(nn.Module):

    ##Creating the constructor
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
        self.blocks = nn.Sequential(    ##Creating the multiple blocks of Multi-Headed Attention and Feed Forward Sequentially.
            Block(n_embd, n_heads = 4),
            Block(n_embd, n_heads = 4),
            Block(n_embd, n_heads = 4),
            nn.LayerNorm(n_embd)  ##Adding the Layer Norm, after the transformer block and before the language modeling head that will
                                  ##decode the vocalbulary 
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)
    ##Creating the forward method
    def forward(self, x, targets = None):
        ##Getting the dimension of the x
        B, T = x.shape
        ##Getting the mebeddings of the tokens by enocding the identities of the tokens that are in x now, after we convert our data in the sequence of integers.
        embeddings = self.embedding_table(x)  ##x will be (B, T, n_embd)
        ##One more thing that people do is adding a positional embedding as well, to get the current position of the index/token in x.
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device = device)) ##torch.arange are all the integers form 0 to T-1, they get embedded through the table to make (T, n_embd)    
        ##Concatenating the embeddings and positional embeddings
        final_embeddings = embeddings + positional_embeddings  ##Torch come in picture and adds the batch dimension to positional embeddings and addition is done.
        ##Feeding to the Multiple Blocks that are implemented sequentially.
        x = self.blocks(final_embeddings)  ##Adding multiple blocks of attention and feed forward.
        ##Creating one linear layer for indirection, x contains both the feature info and the postional information, we can call it simple decoder language modeling head
        logits = self.lm_head(x)  ##logits are (B, T, vocab_size)

        ##Calculating the loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  ##Changing in the two dimensional vector, that is (B*T, C)
            targets = targets.view(B*T)  ##Targets are only going to be single dimensional so (B*T) that is target for each token in the sequence in the batch.
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    ##Method to generate the response
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            ##Cropping the x, because now we are using positional embeddings, we can never have more than the max_seq_length coming in because if I do then our postional embedding look uptable will run out of scope because it only has embeddings up to max_seq_length
            idx_crop = x[:, -max_seq_length:]
            ##We get the scores/outputs by running the model on the inputs
            scores, loss = self(idx_crop)
            ##We only want to focus on the last time step to generate the next word
            scores = scores[:, -1, :] ##it becomes (B, C)
            ##Getting the probabilities using softmax
            probs = F.softmax(scores, dim = -1)
            ##Sampling out from the distribution
            x_next = torch.multinomial(probs, num_samples = 1) ##It becomes (B, 1)
            ##Concatenating the sampled token to the running sequence
            x = torch.cat((x, x_next), dim = 1)  ##It becomes (B, T+1)
            
        return x

In [302]:
m = Transformer().to(device)

In [303]:
m

Transformer(
  (embedding_table): Embedding(65, 64)
  (positional_embedding_table): Embedding(256, 64)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=64, out_features=16, bias=False)
            (query): Linear(in_features=64, out_features=16, bias=False)
            (value): Linear(in_features=64, out_features=16, bias=False)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (proj): Linear(in_features=64, out_features=64, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ffn): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=64, bias=True)
          (3): Dropout(p=0.0, inplace=False)
        )
      )
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln

In [304]:
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [305]:
for steps in range(max_iters):
    ##Sampling out a batch from the training set
    xb, yb = get_batch('train')
    
    ##Evaluating the loss
    scores, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    ##Running backward propagation
    loss.backward()
    ##Updating the weights
    optimizer.step()
    print(f"On step: {steps}, the loss is: ", loss.item())

On step: 0, the loss is:  4.2884521484375
On step: 1, the loss is:  4.145564079284668
On step: 2, the loss is:  3.9970028400421143
On step: 3, the loss is:  3.8521084785461426
On step: 4, the loss is:  3.7702529430389404
On step: 5, the loss is:  3.690779209136963
On step: 6, the loss is:  3.599055051803589
On step: 7, the loss is:  3.5361344814300537
On step: 8, the loss is:  3.480156660079956
On step: 9, the loss is:  3.4440410137176514
On step: 10, the loss is:  3.422365188598633
On step: 11, the loss is:  3.431852340698242
On step: 12, the loss is:  3.3540198802948


KeyboardInterrupt: 

In [11]:
#### At this point, we have pretty complete transformer, that is the "Decoder Only Tranformer", so now we can try
#### scaling it up.

In [12]:
## Let's create some new variables

In [13]:
batch_size = 16
max_seq_length = 32
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_heads = 4
n_layer = 4
dropout = 0.2

In [14]:
## Come to this code cell after the next.
## We can also drop out when we calculate the affinities after the Softmax, in Single head Attention Mechanism.
## Now in our Multi head attention, we also have to introduce a projection (Read After the Block Module)
class SingleHead(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(max_seq_length, max_seq_length)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)  
        q = self.query(x) 
       
        wei = q @ k.transpose(-2,-1) * C**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        v = self.value(x) 
        out = wei @ v 
        return out

In [15]:
## Come to this code cell after the next cell
## We can also dropout after the Multi-Head Attention as well.
## Now in our Multi head attention, we also have to introduce a projection (Read After the Block Module)
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SingleHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [16]:
## Come on this after the next code cell
## Let's add dropout in the Feed Forward Network
## In Feed Forward Network, it is going to be the same thing, let's add the Projection (Read after the addition of Projection layer in the Multi Head Attention Block)
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [17]:
class Block(nn.Module):
    def __init__(self, n_embd, n_heads):
        super().__init__()
        head_size = n_embd // n_heads
        self.sa_heads = MultiHeadAttention(n_heads, head_size)
        self.ffn = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa_heads(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x

In [18]:
class Transformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)  ##Creating the embedding look-up table for our tokens
        self.positional_embedding_table = nn.Embedding(max_seq_length, n_embd)  ##We need the channels or vectors to be n_embd because we want to encode the token in way that what is its position in the sequence.
                                                                                ##So we need to have positional information of the token in the sequence so we encode it based on what ots position is in the seq. so (max_seq_length, n_embd)
                                                                                ##Table contains the embedding for the token based on its position in the sequence.
        self.blocks = nn.Sequential(*[Block(n_embd, n_heads=n_heads) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) 
        self.lm_head = nn.Linear(n_embd, vocab_size)  ##Adding the Linear layer to get the final scores/outputs, (B, T, vocab_size)

    def forward(self, x, targets=None):
        B, T = x.shape
        word_embeddings = self.embedding_table(x)
        positional_embeddings = self.positional_embedding_table(torch.arange(T, device=device))
        final_embeddings = word_embeddings + positional_embeddings 
        x = self.blocks(final_embeddings)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cropped = x[:, -max_seq_length:]
            logits, loss = self(idx_cropped)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            x_next = torch.multinomial(probs, num_samples=1)
            x = torch.cat((x, x_next), dim=1)
        return x

In [19]:
m = Transformer()
m = m.to(device)

In [20]:
m

Transformer(
  (embedding_table): Embedding(65, 64)
  (positional_embedding_table): Embedding(32, 64)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x SingleHead(
            (key): Linear(in_features=64, out_features=16, bias=False)
            (query): Linear(in_features=64, out_features=16, bias=False)
            (value): Linear(in_features=64, out_features=16, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=64, out_features=64, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffn): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=64, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    

In [21]:
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.209729 M parameters


In [22]:
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [32]:
@torch.no_grad()
def estimate_loss(m):
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    # m.train()
    return out

In [27]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.3585, val loss 4.3559
step 100: train loss 2.7117, val loss 2.7158
step 200: train loss 2.5392, val loss 2.5352
step 300: train loss 2.4648, val loss 2.4683
step 400: train loss 2.3989, val loss 2.4024
step 500: train loss 2.3482, val loss 2.3559


KeyboardInterrupt: 

In [36]:
##Loading the model
model = torch.load("multi-head-4ds-ffd-4-block-resconnect-ln-drpt-self-attn-5000-steps-scaled-up.pth", map_location="cpu")

  model = torch.load("multi-head-4ds-ffd-4-block-resconnect-ln-drpt-self-attn-5000-steps-scaled-up.pth", map_location="cpu")


In [37]:
model

Transformer(
  (embedding_table): Embedding(65, 384)
  (positional_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x SingleHead(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffn): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_affi

In [None]:
estimate_loss(model)

In [38]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=2000)[0].tolist()))


The dagger stream'd her moan. Lo, as with horses;
It is yours and undwelled with nobles of drums,
Edward, that libers, wears of dreadful peer, thlight
In the calm and the eye gripe, in conslead
For tuned without Clifford's speech?

CLIFFORD:
Why or it knows I am dean to hang on that hatr, which worse to you farewell;
I am for keep your love against well and done name!

KING RICHARD II:
Nay, I cannot need;
Four parle manage, but their bemoarriage
A thousand expressad ere the great
adversaging; and this the woushiness
Are in conting of cours of the tower
Doth that fair maid the ice pardon,
To make an envy ornamour taunt to stand a
Titatesmer's truth with all the choice may come on;
For this trust may you, indeed, continghtory,
And tremble to seless: if grim it first,
And if the state of be comforous.
That I have told it will colours
I' the open of man dotion, so much betted proport
Of ruival worm best, But his head
His name lives: on my deep sight tears,
And lodge a handkers, she doth m

In [40]:
##For Streaming
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Generate and stream tokens
max_new_tokens = 2000
generated = context

print("Streaming generation:\n")
for _ in range(max_new_tokens):
    output = model.generate(generated, max_new_tokens=1)
    next_token = output[:, -1:]  # Extract the last token
    generated = torch.cat((generated, next_token), dim=1)  # Update context with the new token
    
    # Decode the latest token and print it
    token_text = decode(next_token[0].tolist())
    print(token_text, end="", flush=True)  # Stream the output

print("\n\nGeneration complete!")

Streaming generation:

DUKE OF YORK:
My noble mother, by anight, how words he your own foot talk;
Within her your followers and bretures.
Yet, if thou call'd my last, 'tis dead,
Tread comfortable by date back and gentle Tybalt,
With ligh? God father, what as you speak,
And know, hark! what see comfort I injured?

MONTAGUE:
Why none so more pursues to her like
To Bolingbroke and off cents,
Bear to him, in sulsparent nearing in
hyer's pardon.' What find of this naxe?

MENENIUS:
Little to be shortly moved and challenge
By the frowning on one one words:
Nickly girls and brain,
That brought up Mercy pounning to declay.
Ah, if soft be impeach'd but bald,
And on the napking him, sever for me.

LEONTES:
What?
Why, then am I look'd bereft he note.

Second Lord:
Let him as have assistrength for the humour,
And convey'd can cut there is fourst,
You hot some marriage, never so stink,
I do appear you: some shrewd
Finding her father are haired by him
As he visits and a Christian did
granshal made th

In [43]:
print(text[:2000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.

