<a href="https://colab.research.google.com/github/Rohit-Singh12/Deep-LEARGNINGS/blob/main/Understanding%20LLM/Decoder_only_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Decoder only Transformer

## Generating dataset for training transformers

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

# Vocabulary
vocabulary = ['What', 'is', 'Machine', 'Learning', '<EOS>', 'Mathmematics']
token_to_id = {word: idx for idx, word in enumerate(vocabulary)}
id_to_token = {idx: word for word, idx in token_to_id.items()}

# Inputs and labels (Batch size: 2, Sequence length: 6)
inputs = torch.tensor([
    [token_to_id['What'], token_to_id['is'], token_to_id['Machine'],
     token_to_id['Learning'], token_to_id['<EOS>'], token_to_id['Mathmematics']],
    [token_to_id['Machine'], token_to_id['Learning'], token_to_id['is'],
     token_to_id['What'], token_to_id['<EOS>'], token_to_id['Mathmematics']]
])

labels = torch.tensor([
    [token_to_id['is'], token_to_id['Machine'], token_to_id['Learning'],
     token_to_id['<EOS>'], token_to_id['Mathmematics'], token_to_id['<EOS>']],
    [token_to_id['Learning'], token_to_id['is'], token_to_id['What'],
     token_to_id['<EOS>'], token_to_id['Mathmematics'], token_to_id['<EOS>']]
])

## Calculating Positional Encoding of the inputs

In [35]:
# Positional Encoding
## PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
## PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
class PositionalEncoding(nn.Module):
    def __init__(self, d_model=5, max_len=10):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        embedding_index = torch.arange(0, d_model, 2)
        div_term = 1/torch.tensor(10000.0)**(embedding_index / d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term[:pe[:, 1::2].shape[1]])
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.shape[1], :].unsqueeze(0)




## Masked Self Attention

In [29]:
# Scaled Dot-Product Attention with Masking
class Attention(nn.Module):
    def __init__(self, d_model=5):
        super().__init__()
        self.d_model = d_model
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, q, k, v, mask=None):
        #print("Dimension of q ", q.shape)
        q = self.W_q(q)  # (Batch, Seq, d_model)
        k = self.W_k(k)
        v = self.W_v(v)
        #print(q,k)
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_model ** 0.5)  # (Batch, Seq, Seq)

        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))

        attn_weights = F.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_weights, v)  # (Batch, Seq, d_model)
        return output


## Decoder only Transformer

In [30]:
# Decoder-Only Transformer Model
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, n_tokens, d_model=5, max_len=10):
        super().__init__()
        self.embedding = nn.Embedding(n_tokens, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.self_attention = Attention(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.ReLU(),
            nn.Linear(d_model * 2, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.fc_out = nn.Linear(d_model, n_tokens)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, token_ids, labels=None):
        # print("token ids", token_ids)
        x = self.embedding(token_ids)
        x = self.pos_encoding(x)
        # print("shape of x ", x.shape)
        batch_size, seq_len, _ = x.shape
        mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).expand(batch_size, -1, -1).to(x.device)
        # print("mask ", mask)
        attn_output = self.self_attention(x, x, x, mask)
        x = self.norm1(x + attn_output)

        ffn_output = self.ffn(x)
        x = self.norm2(x + ffn_output)
        # print(" shape of x after ffn", x.shape)
        logits = self.fc_out(x)  # (Batch, Seq, Vocab_size)
        # print(" shape after output", logits.shape)
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
            return logits, loss
        return logits
    def generate(self, start_tokens):
        self.eval()  # Set model to evaluation mode
        generated_tokens = start_tokens.clone()

        while True:
            logits = self.forward(generated_tokens)  # Get logits for the current sequence
            # print(logits)
            next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(1)  # Get most probable next token
            generated_tokens = torch.cat((generated_tokens, next_token), dim=1)  # Append next token

            if next_token.item() == token_to_id['<EOS>']:  # Stop if EOS token is reached
                break

        return generated_tokens

## Training the model

In [31]:
d_model = 5
max_len = 10
n_tokens = len(vocabulary)
lr = 0.01
epochs = 1000

model = DecoderOnlyTransformer(n_tokens, d_model, max_len)
optimizer = Adam(model.parameters(), lr=lr)

# Training Loop
for epoch in range(epochs):
    optimizer.zero_grad()
    logits, loss = model(inputs, labels)
    loss.backward() # Backpropragation step to update weight
    optimizer.step() # Update weights

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")


Epoch 0, Loss: 1.9208663702011108
Epoch 100, Loss: 0.02862042374908924
Epoch 200, Loss: 0.005945244804024696
Epoch 300, Loss: 0.002684066304937005
Epoch 400, Loss: 0.001552328933030367
Epoch 500, Loss: 0.0010218644747510552
Epoch 600, Loss: 0.0007280257996171713
Epoch 700, Loss: 0.0005466984584927559
Epoch 800, Loss: 0.00042633211705833673
Epoch 900, Loss: 0.000342007348081097


## Generating output


In [34]:
# Create an instance of the trained model
model = DecoderOnlyTransformer(n_tokens=len(vocabulary))

start_seq = torch.tensor([[token_to_id['What'], token_to_id['is'], token_to_id['Machine'], token_to_id['Learning']]])

output_tokens = model.generate(start_seq)

# Convert token IDs back to words
generated_sentence = [id_to_token[token.item()] for token in output_tokens[0]]
print("Generated Sentence:", " ".join(generated_sentence))


Generated Sentence: What is Machine Learning Learning Learning <EOS>
