In [25]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("#of characters: ", len(raw_text))
print(raw_text[:99])

#of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


We want to tokenize the short story into individual words and characters for the tokenizer and then put ids for embeddings for LLM training

Use re for regex to split any given text

In [26]:
import re

text = "Hello, everyone. My name is Vir."
split_text1 = re.split(r'(\s)', text) #will split whereaver white space is found
print(split_text1)

['Hello,', ' ', 'everyone.', ' ', 'My', ' ', 'name', ' ', 'is', ' ', 'Vir.']


In [27]:
split_text2 = re.split(r'([,.]|\s)', text) #will split by whitespace comma and period
print(split_text2) 

split_text2 = [item for item in split_text2 if item.strip()]
print(split_text2)

['Hello', ',', '', ' ', 'everyone', '.', '', ' ', 'My', ' ', 'name', ' ', 'is', ' ', 'Vir', '.', '']
['Hello', ',', 'everyone', '.', 'My', 'name', 'is', 'Vir', '.']


Remove whitespaces in the event that they are not relevant, might be relevant for pyhton coding etc. 

In [28]:
split_text3 = re.split(r'([,.!?:;_]|\s|--)', text)
split_text3 = [item for item in split_text3 if item.strip()]
print(split_text3) #This is the simple tokenization scheme, different for LLM (Byte Pair Scheme)

['Hello', ',', 'everyone', '.', 'My', 'name', 'is', 'Vir', '.']


In [29]:
split_raw_text = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in split_raw_text if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [30]:
print(len(preprocessed)) # now we need to convert to token ids

4690


Build Sorted Vocabulary in order to do token ids. sorted alphabetically and then by index, each unique token is mapped to a unique token id

In [31]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [32]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [33]:
for i, item in enumerate(vocab.items()) :
    print(item)
    if i >= 50 :
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


We will need a way to convert token ids back to text for decoder

In [34]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab #encode
        self.int_to_str = {i:s for s, i in vocab.items()} #flip for decoder
    
    def encode(self, text) :
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids) :
        text = " ".join([self.int_to_str[i] for i in ids])
        #replace spaces before punctuation
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text


In [35]:
tokenizer = SimpleTokenizerV1(vocab) #converted to tokens and assigned ids

text = """It's the last he painted, you know, 
            Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]


In [36]:
tokenizer.decode(ids)

"It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride."

What if we have a sentence where we don't have a word like hello in the tokenizer training set, we will get a key error
ChatGPT uses special context tokens for words that might not exist in the vocabulary

Support unknown token and the end of text token, add end of text token between text sources eg. between books

In [40]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)} #enumerate makes alphabetical

In [41]:
len(vocab.items())

1132

In [42]:
for i, item in enumerate(list(vocab.items())[-5:]) :
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [50]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab #encode
        self.int_to_str = {i:s for s, i in vocab.items()} #flip for decoder
    
    def encode(self, text) :
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [
            item if item in self.str_to_int
            else '<|unk|>' for item in preprocessed #if not present in vocabulary then we assign it to the unkown token
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids) :
        text = " ".join([self.int_to_str[i] for i in ids])
        #replace spaces before punctuation
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text


In [51]:
tokenizer2 = SimpleTokenizerV2(vocab)
text1 = "Hello, I am Vir"
text2 = "in the sunlit terraces of the palace"

text = "<|endoftext|>".join((text1, text2))
print(text)



Hello, I am Vir<|endoftext|>in the sunlit terraces of the palace


In [52]:
tokenizer2.decode(tokenizer2.encode(text))

'<|unk|>, I am <|unk|> the sunlit terraces of the <|unk|>'

# Special Context Tokens'

[BOS] Beginning of Sequence

[EOS] End of Sequence

[PAD] Might have texts of varying lengths, so we pad