# Text Generator
Implementing a text generation model from scratch using a decoder-only transformer.\
Steps:
1. Tokenization
2. Vectorization
3. Positional encoding
4. Masking
5. Self-attention
6. Decoder stack
7. Predicting token probabilities

## Creating Training Data

In [None]:
#conda install pytorch torchvision torchaudio -c pytorch

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math
import pandas as pd

In [38]:
class creating_data():
    """Load a CSV file into a DataFrame and write it back out as CSV."""

    def __init__(self, filepath):
        """Read the CSV file at ``filepath`` into ``self.df``.

        Args:
            filepath: Path (or any source accepted by ``pd.read_csv``) of the
                CSV file to load.
        """
        # df: the table exactly as parsed by pandas
        self.df = pd.read_csv(filepath)

    def save(self, path, index=False):
        """Write ``self.df`` to ``path`` as CSV.

        Args:
            path: Destination file path.
            index: Whether to write the row index as a leading column.
                Defaults to False so that reloading the saved file with
                ``pd.read_csv`` does not gain an extra ``Unnamed: 0`` column.
        """
        self.df.to_csv(path, index=index)
    

In [None]:
# dataset = creating_data('medium_articles.csv')
# dataset.save('training_data.csv')

## Tokenization

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math
import pandas as pd

In [28]:
class Tokenizer():
    """Character-level tokenizer over a small fixed vocabulary.

    Token ids are assigned in insertion order: the special tokens ``<pad>``
    and ``<unk>`` first, then the digits, then the letters (each lowercase
    letter immediately followed by its uppercase form), and finally a few
    punctuation and whitespace characters. Any character outside the
    vocabulary maps to the ``<unk>`` id.
    """

    def __init__(self):
        # dictionary: character -> token id
        self.dictionary = {}
        # reverse_dictionary: token id -> character
        self.reverse_dictionary = {}

        # collect the vocabulary in id order, then register it in one pass
        vocabulary = ['<pad>', '<unk>']
        vocabulary += [str(digit) for digit in range(10)]
        for offset in range(26):
            vocabulary += [chr(ord(base) + offset) for base in ('a', 'A')]
        vocabulary += ['.', ' ', ',', '!', '?', '\n']

        for symbol in vocabulary:
            self.__add_to_dict(symbol)

    def __add_to_dict(self, character):
        """Give ``character`` the next free id unless it is already known."""
        if character in self.dictionary:
            return
        next_id = self.size()
        self.dictionary[character] = next_id
        self.reverse_dictionary[next_id] = character

    def tokenize(self, text):
        """Return the list of token ids for ``text``, one per character."""
        return [self.character_to_token(symbol) for symbol in text]

    def character_to_token(self, character):
        """Return the id of ``character``, or the ``<unk>`` id if unknown."""
        fallback = self.dictionary['<unk>']
        return self.dictionary.get(character, fallback)

    def token_to_character(self, token):
        """Return the character for ``token``, or ``'<unk>'`` if unknown."""
        if token in self.reverse_dictionary:
            return self.reverse_dictionary[token]
        return '<unk>'

    def size(self):
        """Return the number of entries in the vocabulary."""
        return len(self.dictionary)

In [29]:
# load the saved training CSV and keep only its 'text' column
training_data = pd.read_csv('training_data.csv')['text']

In [30]:
# preview the first few entries of the text column
training_data.head()

0    Photo by Josh Riemer on Unsplash\n\nMerry Chri...
1    Your Brain On Coronavirus\n\nA guide to the cu...
2    Mind Your Nose\n\nHow smell training can chang...
3    Passionate about the synergy between science a...
4    You’ve heard of him, haven’t you? Phineas Gage...
Name: text, dtype: object

In [31]:
# convert the pandas Series of texts into a NumPy array
training_data = training_data.to_numpy()

In [32]:
# build the character-level tokenizer and encode every text with it
tokenizer = Tokenizer()
tokenized_data = list(map(tokenizer.tokenize, training_data))

In [35]:
# fixed length that every token sequence is forced to
max_sequence_length = 20
pad_token = tokenizer.character_to_token('<pad>')

# Left-pad short sequences and cut long ones down to max_sequence_length.
# A non-positive repeat count yields an empty list, so sequences that are
# already long enough receive no padding and are only sliced.
padded_data = [
    [pad_token] * (max_sequence_length - len(sequence)) + sequence[:max_sequence_length]
    for sequence in tokenized_data
]

In [37]:
# turn each padded token sequence into its own tensor
tensor_data = list(map(torch.tensor, padded_data))

## Input Embeddings

In [41]:
class TokenEmbedding(torch.nn.Module):
    """Lookup table that maps integer token ids to learned vectors."""

    def __init__(self, model_dim, num_tokens):
        """Create the embedding table.

        Args:
            model_dim: Width of each embedding vector.
            num_tokens: Number of distinct token ids the table holds.
        """
        super().__init__()
        # one row of width model_dim per token id
        self.embedding_layer = torch.nn.Embedding(num_tokens, model_dim)

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``."""
        embeddings = self.embedding_layer(x)
        return embeddings

In [40]:
# width of each token embedding vector
model_dim = 50
# vocabulary size reported by the tokenizer
num_tokens = tokenizer.size()

In [45]:
# build the embedding layer for the tokenizer's vocabulary
embedding_model = TokenEmbedding(model_dim, num_tokens)
# Combine the per-sequence tensors into a single 2-D batch. Going through
# list() keeps this cell re-runnable: after the first run tensor_data is
# already the stacked tensor, and torch.stack only accepts a sequence of
# tensors, not a bare Tensor.
tensor_data = torch.stack(list(tensor_data))
# look up an embedding vector for every token in the batch
embedded_data = embedding_model(tensor_data)

In [47]:
# Sanity check: show the overall shape and the first embedded sequence
for label, value in (
    ("Shape of embedded data:", embedded_data.shape),
    ("First embedded sequence:", embedded_data[0]),
):
    print(label, value)

Shape of embedded data: torch.Size([100, 20, 50])
First embedded sequence: tensor([[ 8.5259e-01,  2.5915e-01,  1.2070e-01, -3.9863e-01, -2.3215e+00,
         -1.4551e+00,  1.6208e-01,  9.0236e-01,  4.6822e-01,  5.6436e-01,
          1.2565e+00, -3.8793e-02, -9.3262e-01, -1.0947e+00,  1.3738e-01,
          8.1687e-02, -2.0471e+00,  3.6765e-01,  1.6817e+00,  6.0889e-01,
          7.4221e-01, -3.1976e-02, -2.0586e+00,  2.3220e+00,  1.7538e-01,
          5.1019e-01, -1.5991e+00, -5.1244e-01, -9.6798e-01, -2.2054e-01,
          9.2633e-01,  1.2702e+00,  9.7206e-01,  1.1339e-01, -2.1605e-01,
         -1.0807e+00,  1.0384e+00,  3.5989e-01,  4.1306e-01, -5.1856e-01,
          5.1901e-01,  9.1237e-01,  8.0048e-01,  8.0138e-01, -4.2621e-01,
          7.3701e-01, -7.3618e-02, -6.0010e-01,  1.6909e-01, -1.8172e+00],
        [-9.3831e-01, -3.8516e-01, -3.1127e-01, -6.8238e-01,  1.8700e-01,
         -5.2606e-01,  1.7337e-01, -8.0887e-03, -1.0562e+00,  7.7446e-01,
         -1.0280e+00, -7.5757e-01, -