In [1]:
with open("/Users/pranavisriya/Documents/Courses/LLM-from-scratch/tokenizer/data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text=f.read()

print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
import re
text="Hello, world. This, is a test."
result=re.split(r"[\s]", text)
print(result)
result=re.split(r"([.,\s])", text)
print(result)

['Hello,', 'world.', 'This,', 'is', 'a', 'test.']
['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [4]:
result=[item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [7]:
text="Hello, world. Is this-- a test?"
result=re.split(r'([.,:;?_!"()\']|--|\s--)', text)
result=[item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is this', '--', 'a test', '?']


In [10]:
preprocessed=re.split(r'([,:;?_!"()\']|--|\s)', raw_text)
preprocessed=[item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [9]:
print(len(preprocessed))

1312


## Creating TokenIDs

In [11]:
all_words=sorted(set(preprocessed))
vocab_size=len(all_words)

print("Vocab size:", vocab_size)

Vocab size: 1212


In [12]:
vocab={token:integer for integer, token in enumerate(all_words)}

In [13]:
for i, token in enumerate(list(vocab)[:50]):
    print(i, token)

0 !
1 "
2 '
3 (
4 )
5 ,
6 --
7 .
8 :
9 ;
10 ?
11 A
12 Ah
13 Among
14 And
15 Are
16 Arrt
17 As
18 At
19 Be
20 Begin
21 Burlington
22 But
23 By
24 Carlo
25 Chicago
26 Claude
27 Come
28 Croft
29 Destroyed
30 Devonshire
31 Don
32 Dubarry
33 Emperors
34 Florence.
35 For
36 Gallery
37 Gideon
38 Gisburn
39 Gisburns
40 Grafton
41 Greek
42 Grindle
43 Grindle.
44 Grindles.
45 HAD
46 Had
47 Hang
48 Has
49 He


## Create Token class
1)Create encode method token -> token id
2)decode method token id -> token

In [16]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int=vocab
        self.int_to_str={i:s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed=re.split(r'([,:;?_!"()\']|--|\s)', text)
        preprocessed=[item.strip() for item in preprocessed if item.strip()]
        token_ids=[self.str_to_int[token] for token in preprocessed]
        return token_ids
    
    def decode(self, token_ids):
        tokens=[self.int_to_str[token_id] for token_id in token_ids]
        text=" ".join(tokens)
        text=re.sub(r'\s([.,:;?_!"()\'])', r'\1', text)
        return text

In [17]:
tokenizer=SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
encode=tokenizer.encode(text)
print("Encoded:", encode)

Encoded: [1, 57, 2, 907, 1058, 638, 559, 793, 5, 1208, 631, 5, 1, 68, 38, 908, 1188, 803, 847]


In [18]:
tokenizer.decode(encode)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [19]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

Hello is not present in the vocab. So, it is howing error. To handle this we create another class where it replaces unkown words with <|unk|> and end of the text with <|endoftext|>

In [20]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|unk|>", "<|endoftext|>"])
vocab= {token:integer for integer, token in enumerate(all_tokens)}


In [21]:
len(vocab.items())

1214

In [22]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1209)
('your', 1210)
('yourself', 1211)
('<|unk|>', 1212)
('<|endoftext|>', 1213)


In [23]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [24]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [25]:
tokenizer.encode(text)

[1212,
 5,
 373,
 1208,
 667,
 1043,
 10,
 1213,
 56,
 1058,
 1022,
 1053,
 766,
 1058,
 1212,
 7]

In [26]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'