In [45]:
# imports
import fitz
import re
import tiktoken
import torch
from torch.utils.data import DataLoader, Dataset

Using with open doesn't work well for pdf so i used pymupdf to parse the pdf and get the text from it. I also removed the first 4845 characters from the text because they were the index and just titles, so we can use the real text from the pdf. 

In [46]:
# Getting file

file_path = "relativity.pdf"
doc = fitz.open(file_path)
raw_text = ""
for page in doc:
    raw_text += page.get_text()
raw_text = raw_text[4845:]
print(len(raw_text))
print(raw_text[:200])


185038
In your schooldays most of you who read this book made acquaintance with the noble
building of Euclid's geometry, and you remember — perhaps with more respect than love
— the magnificent structure, on


In [47]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))
print(preprocessed[:50])

35454
['In', 'your', 'schooldays', 'most', 'of', 'you', 'who', 'read', 'this', 'book', 'made', 'acquaintance', 'with', 'the', 'noble', 'building', 'of', 'Euclid', "'", 's', 'geometry', ',', 'and', 'you', 'remember', '—', 'perhaps', 'with', 'more', 'respect', 'than', 'love', '—', 'the', 'magnificent', 'structure', ',', 'on', 'the', 'lofty', 'staircase', 'of', 'which', 'you', 'were', 'chased', 'about', 'for', 'uncounted', 'hours']


In [48]:
# creating the vocabulary
all_words = sorted(set(preprocessed))  # using set here so that duplicates are removed.
vocab_size = len(all_words)
vocab_size

3211

In [49]:
vocab = {token:integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()) :
    print(item)
    if i >= 10:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
('+', 5)
('+43', 6)
(',', 7)
('-', 8)
('-axis', 9)
('-rays', 10)


In [50]:
# building our own tokenizer class

class Tokenizer1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {integer:item for item, integer in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)                      
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
    
tokenizer = Tokenizer1(vocab)

text = """"We will, we will, you!" By"""
ids = tokenizer.encode(text)
print(ids)

[1, 514, 3150, 7, 3126, 3150, 7, 3198, 0, 1, 202]


This encoder is only built on the vocab which the class got from the book, so it doesn't yet understand or have any kind of encoding value for words it doesn't know like rock or Queen so i had to remove them from the statement to get a encoded list. We will use byte-pair encoding or <unk> token to incorporate unknown vocab words. 

In [51]:
tokenizer.decode(ids)


'" We will, we will, you!" By'

Now to mitigate the problem we faced above, of unknown words for the vocab. We will use special characters. The two special characters going to be used are : <EndOfText> indicating the end of content from a particular text and <unk> indicating an unknown character which is not found in the defined vocab.

In [52]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('æther-drift', 3208)
('ϖ', 3209)
('—', 3210)
('<|endoftext|>', 3211)
('<|unk|>', 3212)


Now let's create a new version of the tokenizer including the two new special tokens.

In [53]:
class Tokenizer2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {v:k for k, v in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([.,;:>!?_"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text
    
tokenizer = Tokenizer2(vocab)
text1 = "When you sit with a beautiful girl 2 hours seem like 2 minutes"
text2 = "When you put your hand in a hot kettle, 2 minutes seem like 2 hours"
    
text = "<|endoftext|>".join([text1, text2])
print(tokenizer.encode(text))
    

[517, 3198, 3212, 3154, 539, 781, 3212, 53, 1674, 2620, 1888, 53, 3212, 3212, 517, 3198, 2388, 3199, 1632, 1711, 539, 3212, 3212, 7, 53, 3212, 2620, 1888, 53, 1674]


In [54]:
print(tokenizer.decode(tokenizer.encode(text)))

When you <|unk|> with a beautiful <|unk|> 2 hours seem like 2 <|unk|> <|unk|> When you put your hand in a <|unk|> <|unk|>, 2 <|unk|> seem like 2 hours


In [55]:
tokenizer = tiktoken.get_encoding("gpt2")
text += "einsteinworkamazing"
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[2215, 345, 1650, 351, 257, 4950, 2576, 362, 2250, 1283, 588, 362, 2431, 50256, 2215, 345, 1234, 534, 1021, 287, 257, 3024, 40231, 11, 362, 2431, 1283, 588, 362, 1711, 325, 11962, 1818, 321, 4070]


In [56]:
strings = tokenizer.decode(integers)
print(strings)

When you sit with a beautiful girl 2 hours seem like 2 minutes<|endoftext|>When you put your hand in a hot kettle, 2 minutes seem like 2 hourseinsteinworkamazing


The byte pair encoding is the current best encoding method and it is used in models like chatgpt too. After figuring out this tokenizing part, we will figure out data sampling in a sliding window. For this we will use the pytorch's builtin dataset and dataloaders to create an efficient way of retrieving data when we train our LLM. 

In [57]:
encoded_text = tokenizer.encode(raw_text)
print(len(encoded_text))
context_size = 4

x = encoded_text[:context_size]
y = encoded_text[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

42086
x: [818, 534, 5513, 78]
y:      [534, 5513, 78, 727]


After using the encoder, we will create the dataset class and dataloarders.

In [58]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"
        for i in range(1, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: max_length+i]
            target_chunk = token_ids[i+1: i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
            
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]

Here, max_length is the size of each input sample the dataset will provide, for example, a max_length of 5 would give an input chunk like [1,2,3,4,5]. The stride is defined as how many tokens the window will shift, if stride is max_length then there would be no overlapping. 

In [59]:
def create_dataloader_v1(text, batch_size = 4, max_length = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last, num_workers=num_workers)
    return dataloader

dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=8, stride=2, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[ 534, 5513,   78,  727,  592,  749,  286,  345]]), tensor([[5513,   78,  727,  592,  749,  286,  345,  508]])]


In [60]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  78,  727,  592,  749,  286,  345,  508, 1100]]), tensor([[ 727,  592,  749,  286,  345,  508, 1100,  428]])]


In [61]:
inputs, targets = next(data_iter)
print(inputs)
print(targets)

tensor([[ 592,  749,  286,  345,  508, 1100,  428, 1492]])
tensor([[ 749,  286,  345,  508, 1100,  428, 1492,  925]])


The next thing we are going to create is the embedding matrix. The embedding matrix is a matrix of shape of the dimensions of the vector we want to create for the embedding and the number of unique tokens in the vocabulary. Initially all the weights in the embedding matrix are randomized, then while training the model, these weights are finetuned using backpropogation to minimize the loss function. So basically in each run, it sees that by modifying which weights in in direction and magnitude, is the loss function decreasing by using the gradient descent method. The token vectors we created above don't enclose any kind of semantic meaning, to add semantic meaning to them we create these vectors which initially face in random directions in a high dimensional space but with training they find particular directions where they show some semantic meaning. So for each token id, a embedding vector is created initially for each token in the input. For each token ID, the embedding vector is fixed and shared across all occurrences, regardless of context. However, as the input embedding passes through the model’s layers, it is transformed based on the surrounding words, allowing the model to represent different meanings in different contexts.

In [62]:
# creating the embedding matrix

output_dimension = 256
vocab_size = 50257
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dimension)
max_length = 4 
dataloader_v2 = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride = max_length, shuffle=False
)
data_iter = iter(dataloader_v2)
inputs, targets = next(data_iter)
print(inputs)
print(targets)

tensor([[  534,  5513,    78,   727],
        [  592,   749,   286,   345],
        [  508,  1100,   428,  1492],
        [  925, 35552,   351,   262],
        [15581,   198, 16894,   286],
        [48862,   312,   338, 22939],
        [   11,   290,   345,  3505],
        [  851,  3737,   351,   517]])
tensor([[ 5513,    78,   727,   592],
        [  749,   286,   345,   508],
        [ 1100,   428,  1492,   925],
        [35552,   351,   262, 15581],
        [  198, 16894,   286, 48862],
        [  312,   338, 22939,    11],
        [  290,   345,  3505,   851],
        [ 3737,   351,   517,  2461]])


In [64]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [65]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dimension)

print(pos_embedding_layer.weight)

Parameter containing:
tensor([[ 1.7396, -0.5382, -0.4037,  ...,  0.7914,  1.2251, -0.8515],
        [-0.3755,  0.0399, -0.9273,  ..., -0.3964,  0.1143, -0.0184],
        [-1.0424, -0.3457,  0.3157,  ...,  0.1365,  0.4241, -0.6689],
        [ 1.1800, -1.1487, -0.6358,  ..., -1.1628,  0.8155,  0.1252]],
       requires_grad=True)


In [66]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)
print(pos_embeddings)

torch.Size([4, 256])
tensor([[ 1.7396, -0.5382, -0.4037,  ...,  0.7914,  1.2251, -0.8515],
        [-0.3755,  0.0399, -0.9273,  ..., -0.3964,  0.1143, -0.0184],
        [-1.0424, -0.3457,  0.3157,  ...,  0.1365,  0.4241, -0.6689],
        [ 1.1800, -1.1487, -0.6358,  ..., -1.1628,  0.8155,  0.1252]],
       grad_fn=<EmbeddingBackward0>)


In [67]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
