## Tokenization

In [1]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
    
print('Total number of character": ', len(raw_text))
print(raw_text[:99])  # Print the first 500 characters

Total number of character":  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re

text = "Hello world. This, is a text."
result = re.split(r'(\s)', text)

print(result)  # ['Hello', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'text.']

['Hello', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'text.']


In [3]:
# preprocessed = re.split(r'([,.?_!"()\']--|\s)', raw_text)
preprocessed = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\w\s]", raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

print(preprocessed[:30])  # Print the first 20 tokens

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '-', '-', 'though', 'a', 'good', 'fellow', 'enough', '-', '-', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that']


In [4]:
 print(len(preprocessed))

4669


In [5]:
# finding the unique tokens
unique_tokens = sorted(set(preprocessed))
print(len(unique_tokens))


1155


In [6]:
vocab = {token:integer for integer, token in enumerate(unique_tokens)}

In [7]:
for i, item in enumerate(vocab.items()):
    if i < 50:
        print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('-', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
("Don't", 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
("Gisburn's", 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
("Grindle's", 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)


In [8]:
preprocessed = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\w\s]", raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

all_tokens = sorted(list(set(preprocessed)))

all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: integer for integer, token in enumerate(all_tokens)}

In [9]:
len(all_tokens)

1157

In [10]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1152)
('your', 1153)
('yourself', 1154)
('<|endoftext|>', 1155)
('<|unk|>', 1156)


In [11]:
class SimpleTokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\w\s]", text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s([,.!?;:](?:--)?)(\s|$)', r'\1\2', text)
        return text
# ...existing code...

In [12]:
tokenizer = SimpleTokenizer(vocab)

sample_text = "Hello, world! This is a test."
sample_text_2 = "In the sublit terraces of the palace, the light danced."

text = "<|endoftext|>".join((sample_text, sample_text_2))
print("Original Text: ", text)

Original Text:  Hello, world! This is a test.<|endoftext|>In the sublit terraces of the palace, the light danced.


In [13]:
tokenizer.encode(text)

[1156,
 5,
 1156,
 0,
 108,
 600,
 126,
 1156,
 7,
 1156,
 1156,
 1156,
 1156,
 1156,
 61,
 1009,
 1156,
 1005,
 740,
 1009,
 1156,
 5,
 1009,
 643,
 1156,
 7]

In [14]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, <|unk|>! This is a <|unk|>. <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> In the <|unk|> terraces of the <|unk|>, the light <|unk|>.'

Seems like the tokenization is working however, there will be some issue working with LLM if I leave the tokenization this way without further preprocessing as it is to be seen that there are lots of tokens that have no meaning. So I am going to preprocess the text further so that my LLM could understand the text better.

## BytePair Encoding

In [16]:
import tiktoken 
print('version:', tiktoken.__version__)

version: 0.12.0


In [17]:
tokenizer = tiktoken.get_encoding("gpt2")

In [18]:
tokenizer.n_vocab

50257

In [19]:
# lets see how many tokenizers we got 

tiktoken.list_encoding_names()

['gpt2',
 'r50k_base',
 'p50k_base',
 'p50k_edit',
 'cl100k_base',
 'o200k_base',
 'o200k_harmony']

In [20]:
text = (
    "Hello, do you like tea? <|endoftext|> I prefer coffee!, In the sulit terraces of the palace, the light danced."
    "of someunknownword."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 314, 4702, 6891, 28265, 554, 262, 33154, 270, 8812, 2114, 286, 262, 20562, 11, 262, 1657, 39480, 13, 1659, 617, 34680, 4775, 13]


In [21]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> I prefer coffee!, In the sulit terraces of the palace, the light danced.of someunknownword.


## Data Sampling

In [22]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [23]:
enc_sample = enc_text[50:]

In [26]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1 : context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [27]:
# lets visualize whats going on

for i in range(context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(f"Context: {context} -> Next token: {desired}")

Context: [] -> Next token: 290
Context: [290] -> Next token: 4920
Context: [290, 4920] -> Next token: 2241
Context: [290, 4920, 2241] -> Next token: 287
Context: [290, 4920, 2241, 287] -> Next token: 257


In [28]:
for i in range(context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "->", tokenizer.decode([desired]))

 ->  and
 and ->  established
 and established ->  himself
 and established himself ->  in
 and established himself in ->  a


In [32]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

def __len__(self):
    return len(self.input_ids)

def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]
    

In [33]:
def create_dataloader(txt, batch_size=4, max_length=256, 
                      stride=128, shuffle=True, drop_last=True,
                      num_workers=0):
    
    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = MyDataset(txt, tokenizer, max_length, stride)

    dataloader = Dataloader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader