# Creation of a tokenizer based on LLMs from scratch book

In [2]:
with open("the_verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
print(f"Number of characters: {len(raw_text)}")
print(f"First 99 characters:\n{raw_text[:99]}")


Number of characters: 20479
First 99 characters:
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## The scope of this tokenizer is to process each word and special character and embbed it

In [3]:
import re
text = "Hellow world. Hello again-- from re."
#result = re.split(r'(\s)', text)
result = re.split(r'([,.:,?_!"()\']|--|\s)', text)
print(result)
result = [item for item in result if item.strip()]
print(result)

['Hellow', ' ', 'world', '.', '', ' ', 'Hello', ' ', 'again', '--', '', ' ', 'from', ' ', 're', '.', '']
['Hellow', 'world', '.', 'Hello', 'again', '--', 'from', 're', '.']


In [4]:
preprocessed = re.split(r'([,.:,?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4669


In [5]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Convert the tokens in token_ids. Firstly we need to define a vocabulary
The vocabulary contains every token from the text and it should be written in alphabetical order

In [24]:
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab_size = len(all_tokens)
print(vocab_size)

1145


In [27]:
vocab = {token:integer for integer,token in enumerate(all_tokens)}
# for i, item in enumerate(vocab.items()):
#     print(item)
#     if i>=50:
#         break

for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)    

('younger', 1140)
('your', 1141)
('yourself', 1142)
('<|endoftext|>', 1143)
('<|unk|>', 1144)


In [28]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text):
        preprocessed = re.split(r'([,.:,?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'"])', r'\1', text)
        return text

In [33]:
tokenizer = SimpleTokenizerV1(vocab)
text1 = "hello my friend."
text2 = "It's me Mario!"
sentance = " <|endoftext|> ".join((text1,text2))
print(sentance)
ids = tokenizer.encode(sentance)
print(ids)

hello my friend. <|endoftext|> It's me Mario!
[1144, 705, 481, 7, 1143, 57, 2, 861, 671, 1144, 0]


In [35]:
print(tokenizer.decode(ids))

<|unk|> my friend. <|endoftext|> It' s me <|unk|>!


## Using BPE(Byte pair encoding)

In [37]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [40]:
integers = tokenizer.encode(sentance, allowed_special={"<|endoftext|>"})
print(integers)

[31373, 616, 1545, 13, 220, 50256, 632, 338, 502, 10682, 0]


In [42]:
print(tokenizer.decode(integers))

hello my friend. <|endoftext|> It's me Mario!


## Sliding window for answer generation

In [44]:
import tiktoken

with open("the_verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [52]:
# For educational reasons we omit the first 50 tokens

enc_sample = enc_text[50:]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size]

print(f"x: {x}")
print(f"        {y}")


# Simulation of predictions
for i in range(1,context_size):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "-->", desired)
for i in range(1,context_size):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "-->", tokenizer.decode([desired]))

x: [290, 4920, 2241, 287]
        [4920, 2241, 287]
[290] --> 4920
[290, 4920] --> 2241
[290, 4920, 2241] --> 287
 and -->  established
 and established -->  himself
 and established himself -->  in


## Creation of a data loader

In [54]:
"""
-> we need to provide two tensors
1) The input tensor
2) The output tensor
"""
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]