# Introduction
- This notebook builds a sentence generator trained on the wikisent2 dataset.
- For text generation, the focus is on top-k sampling.
- ***This is a decoder-only transformer.***

# Import needed libraries

In [1]:
from tqdm import tqdm

import torch
from torch import nn
from transformers import AutoTokenizer

# Device agnostic code

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Tensors and modules created later without an explicit device are placed on `device`.
torch.set_default_device(device)
print(f"default device set to {device}")


default device set to cuda


# Prepare the data

In [3]:
# Location of the wikisent2 text file inside the Kaggle input directory.
text_path = "/kaggle/input/wikipedia-sentences/wikisent2.txt"

In [4]:
# Keep only the first 100000 lines of the corpus. The file is streamed line by line so the
# whole dump never has to be held in memory; `str.splitlines` is still applied to every
# physical line so the result is the same list `f.read().splitlines()[:100000]` would give.
lines = []
with open(text_path, 'r', encoding="utf-8") as f:
    for raw_line in f:
        lines.extend(raw_line.splitlines())
        if len(lines) >= 100000:
            break
lines = lines[:100000]

# Length of the longest kept line, measured in CHARACTERS.
# NOTE(review): later cells reuse this value as a token budget (padding length and model
# context size), which pads every example far beyond typical sentence length; a token-based
# limit would likely be cheaper - confirm before changing, since it alters the model size.
max_length = max(len(line) for line in lines)

In [5]:
# Load the pretrained "gpt2" tokenizer through the transformers AutoTokenizer factory.
gpt2tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token; `get_random_batch` pads with it.
gpt2tokenizer.pad_token = gpt2tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
def get_random_batch(lines, batch_size, max_length):
    """Sample a random batch of tokenized lines for next-token prediction.

    Uses the module-level ``gpt2tokenizer`` to encode each sampled line.

    Args:
        lines: Sequence of raw text lines to sample from.
        batch_size: Number of lines to draw; indices are drawn independently, so
            the same line may appear more than once in a batch.
        max_length: Every encoded line is truncated or padded to exactly this
            many tokens.

    Returns:
        Tuple ``(batch_inputs, batch_labels)`` of ``torch.long`` tensors, each of
        shape ``(batch_size, max_length)``. ``batch_labels`` is ``batch_inputs``
        shifted one position to the left with one pad token appended.

    Raises:
        ValueError: If ``lines`` is empty.
    """
    if len(lines) == 0:
        raise ValueError("`lines` must contain at least one line to sample from")

    # The upper bound of torch.randint is exclusive, so it has to be len(lines)
    # (not len(lines) - 1) for the last line to be reachable. `.tolist()` gives
    # plain Python ints, which index the list directly.
    random_lines_batch_idx = torch.randint(0, len(lines), size=(batch_size,)).tolist()

    batch_inputs = []
    batch_labels = []

    for idx in random_lines_batch_idx:
        encoded_line = gpt2tokenizer.encode(
            lines[idx],
            max_length=max_length,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
        )

        batch_inputs.append(encoded_line)
        # Targets are the inputs shifted by one; the final position has no
        # successor, so it is filled with the pad token.
        batch_labels.append(encoded_line[1:] + [gpt2tokenizer.pad_token_id])

    # NOTE(review): padded positions stay in the labels as ordinary targets, so a
    # loss computed on them is dominated by padding when max_length is much larger
    # than the typical sentence - consider masking them in the loss.
    batch_inputs = torch.tensor(batch_inputs, dtype=torch.long)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long)
    return (batch_inputs, batch_labels)

In [7]:
# Draw a single-example batch and inspect the tensor shapes and device placement.
print(f"max_length set to --> {max_length}")
sample_batch = get_random_batch(lines=lines, batch_size=1, max_length=max_length)
print(sample_batch[0].shape)  # inputs
print(sample_batch[1].shape)  # labels
print(sample_batch[0].device)

max_length set to --> 1034
torch.Size([1, 1034])
torch.Size([1, 1034])
cuda:0


In [8]:
def batch_tester(num_samples, batch_size, max_length):
    """Smoke-test ``get_random_batch`` by drawing ``num_samples`` batches.

    Batches are sampled from the module-level ``lines``. A failing draw does not
    abort the run: the exception text is printed and the next draw is attempted.

    Args:
        num_samples: How many batches to draw.
        batch_size: Batch size forwarded to ``get_random_batch``.
        max_length: Token length forwarded to ``get_random_batch``.
    """
    for _attempt in range(num_samples):
        try:
            get_random_batch(
                lines=lines,
                batch_size=batch_size,
                max_length=max_length,
            )
        except Exception as err:
            print(f"error:\n{err}")

In [9]:
# Batch size used for the smoke test; `lines` and `max_length` are the globals
# created when the corpus was loaded, so they need no re-assignment here.
batch_size = 64

# Draw 100 random batches; any failure is printed by `batch_tester`.
batch_tester(num_samples=100, batch_size=batch_size, max_length=max_length)

In [10]:
# Vocabulary size reported by the tokenizer; passed to the model as `vocab_size` below.
gpt2tokenizer.vocab_size

50257

# Transformer decoder model

# Head
- Q, K and V are all computed from the same input (self-attention); there is no encoder block, so no cross-attention over encoder keys/values is needed

In [11]:
class Head(nn.Module):
    """Single head of causal (masked) scaled dot-product self-attention.

    Args:
        n_embd: Size of the last dimension of the input.
        head_size: Size of the query/key/value projections of this head.
        context_size: Largest sequence length supported; sizes the causal mask.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, n_embd, head_size, context_size, dropout):
        super(Head, self).__init__()

        self.Q = nn.Linear(in_features=n_embd, out_features=head_size)
        self.K = nn.Linear(in_features=n_embd, out_features=head_size)
        self.V = nn.Linear(in_features=n_embd, out_features=head_size)

        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer("tril", torch.tril(torch.ones(context_size, context_size)))

        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply causal self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd) with T <= context_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        q = self.Q(x) # (B, T, head_size)
        k = self.K(x) # (B, T, head_size)
        v = self.V(x) # (B, T, head_size)

        # Scale by 1/sqrt(d_k) where d_k is the key dimension (head_size), not the
        # embedding width; this keeps the logits' variance independent of head_size.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale # (B, T, head_size) @ (B, head_size, T) --> (B, T, T)
        # Block attention to future positions before normalising.
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf')) # (B, T, T)
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout_layer(wei)

        return wei @ v # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)

# Multiheaded attention
- just a simple multi-headed attention layer with dropout to reduce overfitting, so the model generalizes better at inference time

In [12]:
class MultiHeadedAttention(nn.Module):
    """Runs several attention heads side by side and projects their concatenation.

    Args:
        n_embd: Size of the last dimension of the input and of the output.
        n_heads: Number of ``Head`` modules to run.
        head_size: Output size of each individual head.
        context_size: Largest sequence length supported by each head.
        dropout: Dropout probability, used inside each head and after the projection.
    """

    def __init__(self, n_embd, n_heads, head_size, context_size, dropout):
        super().__init__()

        head_kwargs = dict(
            n_embd=n_embd,
            head_size=head_size,
            context_size=context_size,
            dropout=dropout,
        )
        self.heads = nn.ModuleList([Head(**head_kwargs) for _ in range(n_heads)])

        concat_width = n_heads * head_size
        self.projection = nn.Linear(in_features=concat_width, out_features=n_embd)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map (B, T, n_embd) to (B, T, n_embd) through all heads plus projection."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)  # (B, T, n_heads*head_size)
        projected = self.projection(merged)
        return self.dropout_layer(projected)

# Feedforward

In [13]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand 4x, ReLU, project, then dropout.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        dropout: Dropout probability applied after the second linear layer.
    """

    def __init__(self, in_features, out_features, dropout):
        super(FeedForward, self).__init__()

        hidden_features = in_features * 4
        expand = nn.Linear(in_features=in_features, out_features=hidden_features)
        activation = nn.ReLU()
        contract = nn.Linear(in_features=hidden_features, out_features=out_features)
        regularizer = nn.Dropout(dropout)

        # Same layer order as before, so state-dict keys stay ffwrd_layer.0 / ffwrd_layer.2.
        self.ffwrd_layer = nn.Sequential(expand, activation, contract, regularizer)

    def forward(self, x: torch.Tensor):
        """Apply the MLP to the last dimension of ``x``."""
        out = self.ffwrd_layer(x)
        return out

# Attention block

In [14]:
class Block(nn.Module):
    """Transformer block: layer-normed self-attention then layer-normed MLP, each added residually.

    Args:
        n_embd: Size of the last dimension of the input and of the output.
        n_heads: Number of attention heads.
        head_size: Output size of each attention head.
        context_size: Largest sequence length supported by the attention heads.
        dropout: Dropout probability used in the attention and feed-forward parts.
    """

    def __init__(self, n_embd, n_heads, head_size, context_size, dropout):
        super().__init__()

        attention_kwargs = dict(
            n_embd=n_embd,
            n_heads=n_heads,
            head_size=head_size,
            context_size=context_size,
            dropout=dropout,
        )
        self.multiheaded_attention = MultiHeadedAttention(**attention_kwargs)
        self.ffwrd = FeedForward(in_features=n_embd, out_features=n_embd, dropout=dropout)
        self.layer_norm1 = nn.LayerNorm(n_embd)
        self.layer_norm2 = nn.LayerNorm(n_embd)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Normalisation is applied to the branch input, not to the residual stream.
        attended = self.multiheaded_attention(self.layer_norm1(x))
        x = x + attended
        transformed = self.ffwrd(self.layer_norm2(x))
        return x + transformed

# Decoder model

In [15]:
class Decoder(nn.Module):
    """Decoder-only transformer language model with simple autoregressive samplers.

    Args:
        context_size: Maximum number of tokens fed to the model at once; also the
            size of the positional embedding table.
        n_embd: Embedding width.
        n_heads: Number of attention heads per block.
        head_size: Output size of each attention head.
        n_blocks: Number of stacked ``Block`` modules.
        vocab_size: Number of token ids the model can embed and predict.
        dropout: Dropout probability forwarded to every block.
        top_k: Number of highest-scoring candidates kept by ``top_k_sampler``.
    """

    def __init__(self, context_size, n_embd, n_heads, head_size, n_blocks, vocab_size, dropout, top_k):
        super(Decoder, self).__init__()
        self.context_size = context_size
        self.n_embd = n_embd
        self.vocab_size = vocab_size

        self.top_k = top_k

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)

        self.blocks = nn.Sequential(*[Block(n_embd=n_embd, n_heads=n_heads, head_size=head_size, context_size=context_size, dropout=dropout) for _ in range(n_blocks)])
        self.layer_norm = nn.LayerNorm(n_embd)

        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute next-token logits for every position.

        Args:
            x: Token ids of shape (B, T) with T <= context_size.

        Returns:
            Logits of shape (B, T, vocab_size).

        Raises:
            ValueError: If T exceeds the size of the positional embedding table.
        """
        _, T = x.shape
        if T > self.position_embedding_table.num_embeddings:
            raise ValueError(f"Sequence length {T} exceeds the maximum context size "
                             f"{self.position_embedding_table.num_embeddings}")

        # Build the position ids on the same device as the input instead of
        # relying on the process-wide default device.
        positions = torch.arange(T, device=x.device)
        token_emb = self.token_embedding_table(x) # (B, T, C)
        pos_emb = self.position_embedding_table(positions) # (T, C), broadcast over B

        x = token_emb + pos_emb
        x = self.blocks(x)
        x = self.layer_norm(x)
        return self.lm_head(x) # (B, T, vocab_size)

    def _autoregress(self, context: torch.Tensor, max_new_tokens: int, eos_id: int, pick_next):
        """Shared generation loop: repeatedly pick one token and append it.

        ``pick_next`` maps last-position logits of shape (B, vocab_size) to the
        chosen token ids of shape (B, 1). Generation stops early, without
        appending, as soon as the chosen token equals ``eos_id``. The early-stop
        check uses ``.item()``, so only a batch size of 1 is supported.
        """
        full_output = context

        for _ in range(max_new_tokens):
            # Only the most recent `context_size` tokens fit into the model.
            window = full_output[:, -self.context_size:]
            logits = self(window)[:, -1, :] # prediction for the last position, (B, vocab_size)
            pred = pick_next(logits) # (B, 1)

            if pred.item() == eos_id:
                break

            full_output = torch.cat([full_output, pred], dim=1)

        return full_output

    def greedy_sampler(self, context: torch.Tensor, max_new_tokens: int, eos_id: int):
        """Generate by always taking the highest-scoring token (batch size 1)."""
        def pick_next(logits):
            # argmax of the logits equals argmax of their softmax, so no softmax is needed.
            return torch.argmax(logits, dim=-1, keepdim=True)

        return self._autoregress(context, max_new_tokens, eos_id, pick_next)

    def random_sampler(self, context: torch.Tensor, max_new_tokens: int, eos_id: int):
        """Generate by sampling from the full softmax distribution (batch size 1)."""
        def pick_next(logits):
            percents = torch.softmax(logits, dim=-1) # (B, vocab_size)
            return torch.multinomial(percents, num_samples=1) # (B, 1)

        return self._autoregress(context, max_new_tokens, eos_id, pick_next)

    def top_k_sampler(self, context: torch.Tensor, max_new_tokens: int, eos_id: int):
        """Generate by sampling among the ``top_k`` highest-scoring tokens (batch size 1)."""
        def pick_next(logits):
            # torch.topk fails when k is larger than the vocabulary, so clamp it.
            k = min(self.top_k, logits.size(-1))
            top_k_logits, top_k_indices = torch.topk(logits, k) # both of shape (B, k)
            top_k_percents = torch.softmax(top_k_logits, dim=-1)
            choice = torch.multinomial(top_k_percents, num_samples=1) # index into the top-k list
            return top_k_indices.gather(-1, choice) # map back to vocabulary ids, (B, 1)

        return self._autoregress(context, max_new_tokens, eos_id, pick_next)

    def generate(self, context: torch.Tensor, max_new_tokens: int, eos_id: int, sampler_type: str = "random"):
        """Generate up to ``max_new_tokens`` tokens after ``context``.

        Args:
            context: Prompt token ids of shape (1, T).
            max_new_tokens: Upper bound on the number of generated tokens.
            eos_id: Token id that ends generation early (it is not appended).
            sampler_type: One of ``"greedy"``, ``"random"`` or ``"top_k"``.

        Returns:
            Tensor of shape (1, T + n) holding the prompt followed by the n
            generated tokens.

        Raises:
            ValueError: If ``sampler_type`` is not a known sampler name.
        """
        samplers = {
            "greedy": self.greedy_sampler,
            "random": self.random_sampler,
            "top_k": self.top_k_sampler,
        }
        if sampler_type not in samplers:
            raise ValueError(
                f"Unknown sampler_type {sampler_type!r}; expected one of {sorted(samplers)}"
            )
        return samplers[sampler_type](context=context, max_new_tokens=max_new_tokens, eos_id=eos_id)
        

# Hyperparameters

In [16]:
n_embd = 256  # embedding width of the model
# Maximum number of tokens the model looks at in one forward pass; it also sizes the
# positional embedding table. NOTE: `max_length` was measured in characters when the
# corpus was loaded, so this is a generous upper bound rather than a tuned value.
context_size = max_length # how many tokens to look at a time; also affects positional_embedding_table
n_heads = 8
head_size = 32  # n_heads * head_size = 256, i.e. equal to n_embd
n_blocks = 3

dropout = 0.2
top_k = 55  # number of highest-scoring candidates kept by the top-k sampler

# Build the (untrained) decoder from the hyperparameters above.
decoder = Decoder(
    context_size=context_size, 
    n_embd=n_embd, n_heads=n_heads, head_size=head_size, 
    n_blocks=n_blocks, vocab_size=gpt2tokenizer.vocab_size, 
    dropout=dropout, top_k=top_k
    )

# Text generator made by me

In [17]:
class ModelTextGenerator():
    """Convenience wrapper that turns a prompt string into generated text samples.

    Args:
        model: Object exposing ``context_size``, ``eval()`` and
            ``generate(context, max_new_tokens, eos_id, sampler_type)``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        num_samples: Number of independent samples produced per ``generate`` call.
        max_new_tokens: Upper bound on generated tokens per sample.
        eos_token_id: Token id that ends a sample early; when ``None`` the
            tokenizer's ``eos_token_id`` is used.
    """

    def __init__(self, model: object, tokenizer: object, num_samples: int, max_new_tokens: int, eos_token_id: int):
        self.model = model
        self.num_samples = num_samples
        self.max_new_tokens = max_new_tokens
        self.tokenizer = tokenizer
        # Keep the caller-supplied id instead of silently ignoring it.
        self.eos_token_id = eos_token_id if eos_token_id is not None else tokenizer.eos_token_id
        self.last_output = ""

        self.params_dict = {
            "model": model,
            "max_new_tokens": max_new_tokens,
            "num_samples": num_samples,
            "previous_outputs": []
        }

    @torch.no_grad()
    def generate(self, starting_text: str, clear_outputs: bool = True, debug: bool = False, sampler_type: str = "random"):
        """Generate ``num_samples`` continuations of ``starting_text``.

        The decoded samples are stored in ``params_dict["previous_outputs"]`` and
        the most recent one in ``last_output``; nothing is returned. The model is
        switched to eval mode and left there.

        Args:
            starting_text: Prompt text; must encode to at least one token.
            clear_outputs: When true, previously stored outputs are discarded
                before the new samples are added.
            debug: Currently unused; accepted for interface compatibility.
            sampler_type: Sampler name forwarded to ``model.generate``.

        Raises:
            ValueError: If the prompt encodes to zero tokens.
        """
        # NOTE: this pre-trim counts characters, not tokens; it only bounds the
        # prompt size before encoding.
        encoded_text = self.tokenizer.encode(
            starting_text[-self.model.context_size:],
            truncation=True,
            add_special_tokens=True,
        )
        if len(encoded_text) == 0:
            raise ValueError("starting_text must encode to at least one token")

        if clear_outputs:
            self.clear_outputs()

        self.model.eval()
        # The prompt is identical for every sample, so encode it once.
        context = torch.tensor(encoded_text, dtype=torch.long).view(1, -1) # make it batched

        for _ in range(self.num_samples):
            model_outputs = self.model.generate(
                context=context,
                max_new_tokens=self.max_new_tokens,
                eos_id=self.eos_token_id,
                sampler_type=sampler_type,
            )
            text_output = self.tokenizer.decode(model_outputs.view(-1).tolist())

            self.last_output = text_output
            self.params_dict["previous_outputs"].append(text_output)

    def update_params(self, model: object = None, max_length: int = None, num_samples: int = None, clear_outputs: bool = None, max_new_tokens: int = None):
        """Update selected settings; arguments left as ``None`` are kept as they are.

        Args:
            model: Replacement model.
            max_length: Legacy alias for ``max_new_tokens``; used only when
                ``max_new_tokens`` is not given.
            num_samples: New number of samples per ``generate`` call.
            clear_outputs: When true, stored outputs are discarded first.
            max_new_tokens: New upper bound on generated tokens per sample.
        """
        if clear_outputs:
            self.clear_outputs()

        if max_new_tokens is None:
            max_new_tokens = max_length

        updated_dict = {
            "model": model,
            "max_new_tokens": max_new_tokens,
            "num_samples": num_samples
        }

        for attribute, value in updated_dict.items():
            if value is not None:
                self.params_dict[attribute] = value
                setattr(self, attribute, value)

    def clear_outputs(self):
        """Forget every stored output."""
        self.params_dict["previous_outputs"] = []
        self.last_output = ""

    def print_outputs(self, last: bool = None):
        """Print the last output when ``last`` is true, otherwise all stored outputs."""
        if last:
            print(self.last_output)
        else:
            for output in self.params_dict["previous_outputs"]:
                print(f"\n{output}\n\n")

# Sample from the model

In [18]:
# Sample from the decoder before any training has happened, as a baseline for comparison.
decoder_generator = ModelTextGenerator(model=decoder, tokenizer=gpt2tokenizer, num_samples=2, max_new_tokens=10, eos_token_id=gpt2tokenizer.eos_token_id)
decoder_generator.generate(starting_text=".", sampler_type="random")
decoder_generator.print_outputs()
# Display the module tree of the decoder.
decoder

2024-07-12 01:46:41.942968: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 01:46:41.943092: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 01:46:42.066120: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



. martyr 1918otiation Sanct flouratum SIM tailoribrarian gib



. botherKa descent806letterminimum Soci Harelege Sega




Decoder(
  (token_embedding_table): Embedding(50257, 256)
  (position_embedding_table): Embedding(1034, 256)
  (blocks): Sequential(
    (0): Block(
      (multiheaded_attention): MultiHeadedAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (Q): Linear(in_features=256, out_features=32, bias=True)
            (K): Linear(in_features=256, out_features=32, bias=True)
            (V): Linear(in_features=256, out_features=32, bias=True)
            (dropout_layer): Dropout(p=0.2, inplace=False)
          )
        )
        (projection): Linear(in_features=256, out_features=256, bias=True)
        (dropout_layer): Dropout(p=0.2, inplace=False)
      )
      (ffwrd): FeedForward(
        (ffwrd_layer): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (layer_norm1): LayerNorm

# Training loop

In [19]:
def train_model(model, lines, batch_size, loss_fn, optimizer, epochs, seq_length=None):
    """Train ``model`` on randomly sampled batches of ``lines``.

    Each "epoch" is a single optimisation step on one random batch, not a full
    pass over ``lines``.

    Args:
        model: Module mapping (B, T) token ids to (B, T, vocab_size) logits.
        lines: Raw text lines to sample batches from.
        batch_size: Number of lines per batch.
        loss_fn: Callable taking (B*T, vocab_size) logits and (B*T,) targets.
        optimizer: Optimizer holding the parameters of ``model``.
        epochs: Number of optimisation steps to run.
        seq_length: Token length every line is padded/truncated to. When ``None``
            the module-level ``max_length`` is used, as before.
    """
    if seq_length is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        seq_length = max_length

    model.train()
    for epoch in tqdm(range(epochs)):
        batch_inputs, batch_labels = get_random_batch(lines=lines, batch_size=batch_size, max_length=seq_length)
        logits = model(batch_inputs) # (B, T, vocab_size)

        # Flatten batch and time so every position is one classification example.
        loss = loss_fn(logits.view(-1, logits.size(-1)), batch_labels.view(-1)) # (B*T, vocab_size) vs (B*T)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 25 == 0:
            print(f"loss for epoch {epoch} --> {loss.item()}")

In [20]:
# AdamW over all decoder parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=decoder.parameters(), lr=1e-3)
# Token-level cross-entropy between predicted logits and next-token labels.
loss_fn = nn.CrossEntropyLoss()

In [21]:
# Run 250 optimisation steps with batches of 8 lines; the loss is printed every 25 steps.
train_model(model=decoder, lines=lines, batch_size=8, loss_fn=loss_fn, optimizer=optimizer, epochs=250)

  0%|          | 1/250 [00:00<01:59,  2.08it/s]

loss for epoch 0 --> 11.23080062866211


 10%|█         | 26/250 [00:07<01:01,  3.65it/s]

loss for epoch 25 --> 0.2406264692544937


 20%|██        | 51/250 [00:14<00:54,  3.67it/s]

loss for epoch 50 --> 0.1925031691789627


 30%|███       | 76/250 [00:20<00:47,  3.66it/s]

loss for epoch 75 --> 0.21951882541179657


 40%|████      | 101/250 [00:27<00:40,  3.64it/s]

loss for epoch 100 --> 0.2181881219148636


 50%|█████     | 126/250 [00:34<00:33,  3.66it/s]

loss for epoch 125 --> 0.18306320905685425


 60%|██████    | 151/250 [00:41<00:27,  3.65it/s]

loss for epoch 150 --> 0.19811087846755981


 70%|███████   | 176/250 [00:48<00:20,  3.67it/s]

loss for epoch 175 --> 0.136253222823143


 80%|████████  | 201/250 [00:55<00:13,  3.67it/s]

loss for epoch 200 --> 0.2249298393726349


 90%|█████████ | 226/250 [01:01<00:06,  3.69it/s]

loss for epoch 225 --> 0.15054036676883698


100%|██████████| 250/250 [01:08<00:00,  3.66it/s]


# Model inference

In [22]:
# Sample two continuations of "Friday" from the trained decoder using top-k sampling.
decoder_generator = ModelTextGenerator(model=decoder, tokenizer=gpt2tokenizer, num_samples=2, max_new_tokens=100, eos_token_id=gpt2tokenizer.eos_token_id)
decoder_generator.generate(starting_text="Friday", sampler_type="top_k")
decoder_generator.print_outputs()


Friday to is of's population,, a tournament, an-.



Friday of the also, is a. is a from is a who is a to (.




# Model size in params and architecture

In [23]:
def count_parameters(model: nn.Module):
    """Return the total number of elements across the trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Number of trainable parameters in the decoder.
count_parameters(decoder)

28416337

In [24]:
# Display the module tree of the trained decoder.
decoder

Decoder(
  (token_embedding_table): Embedding(50257, 256)
  (position_embedding_table): Embedding(1034, 256)
  (blocks): Sequential(
    (0): Block(
      (multiheaded_attention): MultiHeadedAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (Q): Linear(in_features=256, out_features=32, bias=True)
            (K): Linear(in_features=256, out_features=32, bias=True)
            (V): Linear(in_features=256, out_features=32, bias=True)
            (dropout_layer): Dropout(p=0.2, inplace=False)
          )
        )
        (projection): Linear(in_features=256, out_features=256, bias=True)
        (dropout_layer): Dropout(p=0.2, inplace=False)
      )
      (ffwrd): FeedForward(
        (ffwrd_layer): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (layer_norm1): LayerNorm