# Text Generator
Implementing a text generation model from scratch using a transformer (decoder only).\
Steps:
1. Tokenization
2. Input embedding
3. Positional encoding
4. Masking
5. Self-attention
6. Decoder stack
7. Predicting token probabilities

## Creating Training Data

In [1]:
#conda install pytorch torchvision torchaudio -c pytorch

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math
import pandas as pd
import random
import matplotlib.pyplot as plt

In [3]:
class creating_data():
    def __init__(self, filepath):
        self.df = pd.read_csv(filepath)
    
    def save(self, path):
        self.df.to_csv(path)
    

In [4]:
# dataset = creating_data('medium_articles.csv')
# dataset.save('training_data.csv')

## Tokenization

In [5]:
class Tokenizer():
    def __init__(self):
        self.dictionary = {}
        self.reverse_dictionary = {}
        
        # adding special tokens
        self.__add_to_dict('<pad>')
        self.__add_to_dict('<unk>')
        
        # add characters and numbers to dictionary
        for i in range(10):
            self.__add_to_dict(str(i))
        
        for i in range(26):
            self.__add_to_dict(chr(ord('a') + i))
            self.__add_to_dict(chr(ord('A') + i))
            
        # adding space and punctuation
        for char in ['.', ' ', ',', '!', '?', '\n']:
            self.__add_to_dict(char)
        
    def __add_to_dict(self, character):
        if character not in self.dictionary:
            index = self.size()
            self.dictionary[character] = index
            self.reverse_dictionary[index] = character
            
    def tokenize(self, text):
        return [self.character_to_token(c) for c in text]
    
    def character_to_token(self, character):
        return self.dictionary.get(character, self.dictionary['<unk>'])
    
    def token_to_character(self, token):
        return self.reverse_dictionary.get(token, '<unk>')
    
    def size(self):
        return len(self.dictionary)

In [6]:
training_data = pd.read_csv('training_data.csv')
training_data = training_data['text']

In [7]:
training_data.head()

0    Photo by Josh Riemer on Unsplash\n\nMerry Chri...
1    Your Brain On Coronavirus\n\nA guide to the cu...
2    Mind Your Nose\n\nHow smell training can chang...
3    Passionate about the synergy between science a...
4    You’ve heard of him, haven’t you? Phineas Gage...
Name: text, dtype: object

In [8]:
training_data = training_data.to_numpy()

In [9]:
# instantiating tokenizer
tokenizer = Tokenizer()
tokenized_data = [tokenizer.tokenize(sentence) for sentence in training_data]

In [10]:
max_sequence_length = 20
# padding and truncating
padded_data = []

for tokens in tokenized_data:
    if len(tokens) < max_sequence_length:
        # padding
        tokens = [tokenizer.character_to_token('<pad>')] * (max_sequence_length - len(tokens)) + tokens
    else:
        # truncating
        tokens = tokens[:max_sequence_length]
    padded_data.append(tokens)

In [11]:
# converting data to tensors
tensor_data = [torch.tensor(tokens) for tokens in padded_data]

## Input Embeddings

In [12]:
class TokenEmbedding(torch.nn.Module):
    # model that converts tokens into embeddings
    
    def __init__(self, model_dim, num_tokens):
        super().__init__()
        self.embedding_layer = torch.nn.Embedding(
            num_embeddings = num_tokens,
            embedding_dim = model_dim
        )
        
    def forward(self, x):
        return self.embedding_layer(x)

In [13]:
model_dim = 50
num_tokens = tokenizer.size()

In [14]:
# initializing class
embedding_model = TokenEmbedding(model_dim, num_tokens)
# convert padded data to tensor
tensor_data = torch.stack(tensor_data)
embedded_data = embedding_model(tensor_data)

In [15]:
# Print the shape of the embedded data to verify
print("Shape of embedded data:", embedded_data.shape)

# Print the first embedded sequence for verification
print("First embedded sequence:", embedded_data[0])

Shape of embedded data: torch.Size([100, 20, 50])
First embedded sequence: tensor([[ 0.8844, -0.8275,  0.1637,  0.4399, -1.2731,  1.0260,  0.8135, -0.3600,
          0.4726,  0.3255, -1.5106, -0.6047, -0.5044, -1.2279, -0.1926,  0.6179,
          1.8251, -0.3024, -0.9104,  0.0415, -0.2966,  0.7469, -1.2024, -1.3591,
          0.7897, -0.1609, -0.5841, -0.2200, -0.4246,  1.8333, -0.4698,  1.0196,
         -0.2160,  0.5330,  0.3564,  1.9847,  1.1330,  0.6115, -0.6802, -1.2569,
          0.7539,  0.2378,  0.7264, -0.5260,  1.9869, -0.6171, -0.5291, -2.1909,
         -0.0544, -0.6772],
        [ 0.1019, -0.5571,  0.8181, -0.5467,  0.0855, -0.0229,  0.7933, -2.2315,
          2.0713,  0.5831, -0.2309, -0.1784,  0.7542, -1.4542,  0.8075, -0.1818,
         -0.7413,  0.1827, -1.1486,  1.0028,  0.3147, -0.1738, -1.2141,  0.3591,
         -0.3803,  0.2126, -0.4082, -0.7280,  0.9221,  0.2378,  2.2642, -0.1053,
         -0.8234, -1.6579,  1.5341, -0.7315, -1.4654,  0.2314,  2.3472, -0.4193,
      

## Positional Encoding

In [16]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, model_dim, max_sequence_length):
        super().__init__()
        self.model_dim = model_dim
    
        positional_encoding = np.zeros((max_sequence_length, model_dim))
        
        # calculating encoding for each position and dim
        for pos in range(max_sequence_length):
            for i in range(0, self.model_dim, 2):
                # sin to even indices
                positional_encoding[pos, i] = np.sin(pos / (10000 ** ((2 * i) / model_dim)))
                
                # cos to odd indices
                if i + 1 < self.model_dim:
                    positional_encoding[pos, i + 1] = np.cos(pos / (10000 ** ((2 * i) / model_dim)))
                    
        
        self.positional_encoding = torch.from_numpy(positional_encoding).unsqueeze(0).float()
            

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.positional_encoding[:, : seq_len, :]
        

In [17]:
pos_encoding = PositionalEncoding(model_dim, max_sequence_length)
encoded_data = pos_encoding(embedded_data)

In [18]:
# Print the shape of the encoded data to verify
print("Shape of encoded data:", encoded_data.shape)

# Print the first encoded sequence for verification
print("First encoded sequence:", encoded_data[0])

Shape of encoded data: torch.Size([100, 20, 50])
First encoded sequence: tensor([[ 0.8844,  0.1725,  0.1637,  1.4399, -1.2731,  2.0260,  0.8135,  0.6400,
          0.4726,  1.3255, -1.5106,  0.3953, -0.5044, -0.2279, -0.1926,  1.6179,
          1.8251,  0.6976, -0.9104,  1.0415, -0.2966,  1.7469, -1.2024, -0.3591,
          0.7897,  0.8391, -0.5841,  0.7800, -0.4246,  2.8333, -0.4698,  2.0196,
         -0.2160,  1.5330,  0.3564,  2.9847,  1.1330,  1.6115, -0.6802, -0.2569,
          0.7539,  1.2378,  0.7264,  0.4740,  1.9869,  0.3829, -0.5291, -1.1909,
         -0.0544,  0.3228],
        [ 0.9434, -0.0168,  1.2786,  0.3409,  0.3126,  0.9510,  0.9027, -1.2375,
          2.1238,  1.5818, -0.2058,  0.8212,  0.7662, -0.4543,  0.8132,  0.8182,
         -0.7385,  1.1827, -1.1473,  2.0028,  0.3154,  0.8262, -1.2138,  1.3591,
         -0.3802,  1.2126, -0.4082,  0.2720,  0.9221,  1.2378,  2.2642,  0.8947,
         -0.8234, -0.6579,  1.5341,  0.2685, -1.4654,  1.2314,  2.3472,  0.5807,
        

## Masking and Attention

In [19]:
class MaskedSelfAttention(torch.nn.Module):
    def __init__(self, embedding_dimension, head_dimension):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.head_dimension = head_dimension
        
        self.query_layer = torch.nn.Linear(self.embedding_dimension, self.head_dimension)
        self.key_layer = torch.nn.Linear(self.embedding_dimension, self.head_dimension)
        self.value_layer = torch.nn.Linear(self.embedding_dimension, self.head_dimension)
        self.softmax = torch.nn.Softmax(dim = -1)
        
    def forward(self, x, mask):
        # x dim - (batch_size, sequence_length, embedding_dim)
        # mask dim - (batch_size, sequence_length, head_dim)
        # output dim - (batch_size, sequence_length)
        
        query = self.query_layer(x)
        key = self.key_layer(x)
        value = self.value_layer(x)
        
        # calculating attention weights and scaling
        attention_weights = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(self.head_dimension)
        
        # masking
        if mask is not None:
            attention_weights = attention_weights.masked_fill(mask == 0, float('-inf'))
        
        attention_scores = self.softmax(attention_weights)
        return torch.matmul(attention_scores, value)

In [20]:
class MaskedMultiHeadedSelfAttention(torch.nn.Module):
    def __init__(self, embedding_dimension, num_heads):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.head_dimension = embedding_dimension // num_heads
        self.num_heads = num_heads
        
        self.self_attentions = torch.nn.ModuleList(
            [MaskedSelfAttention(embedding_dimension, self.head_dimension) for _ in range(self.num_heads)]
        )
        
        self.output_layer = torch.nn.Linear(self.num_heads * self.head_dimension, self.embedding_dimension)
        
    def forward(self, x, mask):
        self_attention_outputs = [self_attention(x, mask) for self_attention in self.self_attentions]
        
        # concatenating outputs
        concatenated_outputs = torch.cat(self_attention_outputs, dim = 2)
        return self.output_layer(concatenated_outputs)

## Decoder

In [21]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, embedding_dim, num_heads, feed_forward_dim, dropout_rate):
        super().__init__()
        
        self.multi_attention = MaskedMultiHeadedSelfAttention(embedding_dim, num_heads)
        self.feed_forward = FeedForward(embedding_dim, feed_forward_dim)
        self.dropout = torch.nn.Dropout(dropout_rate)
        
        self.layer_norm_1 = torch.nn.LayerNorm(embedding_dim)
        self.layer_norm_2 = torch.nn.LayerNorm(embedding_dim)
        
    def forward(self, x, mask):
        x_norm = self.layer_norm_1(x)
        attention_output = self.multi_attention(x_norm, mask)
        residual_output = x + self.dropout(attention_output)
        
        # feedforward block
        residual_output_norm = self.layer_norm_2(residual_output)
        feed_forward_output = self.feed_forward(residual_output_norm)
        
        if self.training:
            feed_forward_output = self.dropout(feed_forward_output)
            
        return residual_output + feed_forward_output

In [22]:
class DecoderStack(torch.nn.Module):
    def __init__(self, embedding_dim, num_layers, num_heads, feed_forward_dim, dropout_rate):
        super().__init__()
        
        self.decoder_layers = torch.nn.ModuleList(
            [DecoderLayer(embedding_dim, num_heads, feed_forward_dim, dropout_rate) for _ in range(num_layers)]
        )
        
    def forward(self, x, mask):
        outputs = x
        for layer in self.decoder_layers:
            outputs = layer(outputs, mask)
        
        return outputs

In [23]:
class FeedForward(torch.nn.Module):
    def __init__(self, embedding_dim, feed_forward_dim):
        super().__init__()
        self.linear_1 = torch.nn.Linear(embedding_dim, feed_forward_dim)
        self.linear_2 = torch.nn.Linear(feed_forward_dim, embedding_dim)
    
    def forward(self, x):
        x = self.linear_1(x)
        x = torch.nn.functional.relu(x)
        x = self.linear_2(x)
        
        return x

## Building the Model

In [24]:
class TextGenerator(torch.nn.Module):
    def __init__(self, num_tokens, max_sequence_length = 100, embedding_dim = 512, num_layers = 6, num_heads = 4, feed_forward_dim = None, dropout_rate = 0.1):
        super().__init__()
        self.num_tokens = num_tokens
        self.max_sequence_length = max_sequence_length
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        
        if feed_forward_dim is None:
            self.feed_forward_dim = embedding_dim * 4
        else:
            self.feed_forward_dim = feed_forward_dim
        
        self.dropout_rate = dropout_rate
        
        self.token_embedding = TokenEmbedding(embedding_dim, num_tokens)
        self.positional_encoding = PositionalEncoding(embedding_dim, max_sequence_length)
        self.layer_norm = torch.nn.LayerNorm(embedding_dim)
        
        self.decoder = DecoderStack(embedding_dim, num_layers, num_heads, self.feed_forward_dim, dropout_rate)
        self.generator_head = GeneratorHead(embedding_dim, num_tokens)
        
    def forward(self, x, mask):
        token_embedding = self.token_embedding(x)
        positional_encoding = self.positional_encoding(token_embedding)
        positional_encoding_norm = self.layer_norm(positional_encoding)
        decoder_outputs = self.decoder(positional_encoding_norm, mask)
        generator_outputs = self.generator_head(decoder_outputs)
        
        return generator_outputs

In [25]:
class GeneratorHead(torch.nn.Module):
    def __init__(self, embedding_dim, num_tokens):
        super().__init__()
        self.linear = torch.nn.Linear(embedding_dim, num_tokens)
        
    def forward(self, x):
        return self.linear(x)

## Autoregressive Wrapper

In [26]:
class AutoregressiveWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        
    def forward(self, x, mask):
        inputs, targets = x[:, :-1], x[:, 1:]
        mask = mask[:, :-1]
        
        output = self.model(inputs, mask)
        return output, targets
    
    def next_token_probabilities(self, x, mask, temperature = 1.0):
        logits = self.model(x, mask)[:, -1, :]
        
        if temperature != 1.0:
            logits /= temperature
        
        probabilities = torch.softmax(logits, dim = -1)
        
        return probabilities

## Training the Model

In [27]:
class Trainer:
    def __init__(self, model, tokenizer: Tokenizer, optimizer = None):
        self.model = model
        
        if optimizer is None:
            self.optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
        else:
            self.optimizer = optimizer
        
        self.tokenizer = tokenizer
        self.loss_function = torch.nn.CrossEntropyLoss()
        
    def train(self, data, epochs, batch_size):
        loss_epoch = []
        
        for epoch in range(epochs):
            losses = []
            random.shuffle(data)
            
            batches = []
            for i in range(0, len(data), batch_size):
                sequence = torch.tensor(data[i: i + batch_size], dtype = torch.long)
                mask_tensor = torch.ones_like(sequence)
                mask_tensor[sequence == self.tokenizer.character_to_token('<pad>')] = 0
                
                batches.append((sequence, mask_tensor))
                
            for batch in batches:
                self.model.train()
                
                input_tensor = torch.zeros((batch_size, self.model.model.max_sequence_length + 1), dtype = torch.long)
                mask_tensor = torch.zeros((batch_size, self.model.model.max_sequence_length + 1), dtype = torch.long)
                
                for i, inp in enumerate(batch[0]):
                    input_tensor[i, :len(inp)] = inp
                
                for i, mask in enumerate(batch[1]):
                    mask_tensor[i, :len(mask)] = mask
                    
                model_output, target = self.model(input_tensor, mask_tensor)
                
                loss = self.loss_function(model_output.transpose(1, 2), target)
                loss.backward()
                
                nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
                self.optimizer.step()
                self.optimizer.zero_grad()
                losses.append(loss.item())
                
            epoch_loss = np.average(losses)
            loss_epoch.append(epoch_loss)
            print(f"Epoch: {epoch}, Loss: {epoch_loss}")
        
        return loss_epoch

## Generator

In [28]:
class Generator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        
    def pad_left(self, sequence, final_length, padding_token):
        return [padding_token] * (final_length - len(sequence)) + sequence
        
    def generate(self, max_tokens, prompt = None, temperature = 1.0, eos_token = None, padding_token = 0):
        self.model.eval()
        
        if prompt is None:
            start_tokens = [padding_token]
        else:
            start_tokens = self.tokenizer.tokenize(prompt)
            
        input_tensor = torch.tensor(
            self.pad_left(start_tokens, self.model.max_sequence_length, padding_token), dtype = torch.long
        ).unsqueeze(0)
        

        for _ in range(max_tokens):
            x = input_tensor[:, -self.model.max_sequence_length:]
            
            mask = torch.ones_like(x)
            mask[x == padding_token] = 0
            
            next_token_prob = self.model.next_token_probabilities(x = x, temperature = temperature, mask = mask)
            
            next_token = torch.multinomial(next_token_prob, num_samples = 1)
            
            input_tensor = torch.cat([input_tensor, next_token.unsqueeze(0)], dim = 1)
            
            if eos_token is not None and next_token.item() == eos_token:
                break
        
        generated_tokens = input_tensor[0].tolist()
        return ''.join([self.tokenizer.token_to_character(token) for token in generated_tokens if token != padding_token])

## Running

In [29]:
def create_training_sequences(max_sequence_length, tokenized_data):
    sequences = []
    for i in range(0, len(tokenized_data) - max_sequence_length):
        sequences.append(tokenized_data[i: i + max_sequence_length + 1])
    
    return sequences

In [30]:
def tokenize_and_pad_training_data(max_sequence_length, tokenizer, training_data):
    tokenized_data = tokenizer.tokenize(training_data)
    padded_data = [tokenizer.character_to_token('<pad>')] * max_sequence_length + tokenized_data
    
    return padded_data

In [31]:
class Run(torch.nn.Module):
    def __init__(self, embedding_dim = 256, max_sequence_length = 50):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        
    def run(self, prompt):
        tokenizer = Tokenizer()
        num_tokens = tokenizer.size()
        
        model = AutoregressiveWrapper(TextGenerator(
            embedding_dim = self.embedding_dim,
            num_tokens = num_tokens,
            num_heads = 4,
            num_layers = 3, 
            dropout_rate = 0.1,
            max_sequence_length = self.max_sequence_length
        ))
        
        training_data = pd.read_csv('training_data.csv')['text'].to_numpy()
        training_data = '. '.join(training_data)
        
        tokenized_and_padded_training_data = tokenize_and_pad_training_data(self.max_sequence_length, tokenizer, training_data)
        sequences = create_training_sequences(self.max_sequence_length, tokenized_and_padded_training_data)
        
        # training
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        
        optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
        trainer = Trainer(model, tokenizer, optimizer)
        loss_per_epoch = trainer.train(sequences, epochs = 100, batch_size = 16)
        
        # Plot the loss per epoch in log scale
        plt.plot(loss_per_epoch)
        plt.yscale('log')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.show()
        
        # generate text
        max_tokens = 400
        generator = Generator(model, tokenizer)
        generated_text = generator.generate(
            max_tokens = max_tokens, prompt = prompt, padding_token = tokenizer.character_to_token('<pad>')
        )
        
        print(generated_text.replace('<pad>', ''))

In [32]:
runner = Run()
runner.run(prompt = "Photo by")

  _torch_pytree._register_pytree_node(


RuntimeError: The size of tensor a (16) must match the size of tensor b (50) at non-singleton dimension 1