## Tokenization

In [71]:
with open('the-verdict.txt','r',encoding='utf-8') as f:
    verdict = f.read()

import re
alltext = re.findall(r'\w+|[,.!?"\']|--',verdict)


alltext = sorted(set(alltext)) #sort the text

print(alltext)

vocab = {key:value for key,value in enumerate(alltext)} ## vocab
# for i,item in enumerate(vocab.items()):
#     if(i<50):
#         print(i,item)



['!', '"', "'", ',', '--', '.', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry_', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those', 'Though', 'Thwing', 'Thwings', 'To', 'Usually', 'Venetian', 'Victor', 'Was', 'We', 'Well', 'What', 'When', 'Why', 'Yes', 'You', '_I', '_am_', '_famille', '_felt_', '_has_', '_have_', '_jardiniere_', '_min

In [63]:
class SimpleClassTokenizer:
    def __init__(self,vocab):
        self.str_to_int = {s:i for i,s in vocab.items()}
        self.int_to_str = vocab
    
    def encode(self,text):
        preprocessed = re.findall(r'\w+|[,.!?"\']|--',text)
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
     
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.!?"()\'])', r'\1', text)
        return text


In [None]:
tokenizer = SimpleClassTokenizer(vocab)

text1 = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text1)
print(ids)
tokenizer.decode(ids)


[1, 52, 2, 863, 1002, 611, 542, 756, 3, 1140, 605, 3, 1, 63, 5, 34, 864, 1122, 765, 806, 5]


'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [65]:
# print(alltext)
alltext2 = alltext
alltext2.extend(['<|endoftext|>','<|unk|>']) ## Add endline
print(len(alltext2))
vocab = {i:s for i,s in enumerate(alltext2)}
print(len(vocab))


1146
1146


In [None]:
alltext.clear()

In [66]:
class SimpleClassTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = {s:i for i,s in vocab.items()}
        self.int_to_str = vocab

    def encoding(self,text):
        preprocessed = re.findall(r'\w+|--|[,.:;\-()?!"\']',text)
        preprocessed = [item if item in self.str_to_int else '<|endoftext|>' for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decoding(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+(--|[,.:;\-()?!"\'])',r'\1',text)
        return text


In [None]:
tokenizer = SimpleClassTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text0 = " <|endoftext|> ".join((text1, text2))

print(text0)
ids = tokenizer.encoding(text0)
print(ids)
print(tokenizer.decoding(ids))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1144, 3, 370, 1140, 636, 989, 6, 1144, 51, 1002, 970, 998, 732, 1002, 1144, 5]
<|endoftext|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|endoftext|>.


## Byte Pair Encoding

In [77]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')
token_ids = tokenizer.encode(verdict,allowed_special={"<|endoftext|>"})
text = tokenizer.decode(token_ids)
print(len(token_ids))
print(len(text))
print(text[:50])

5145
20479
I HAD always thought Jack Gisburn rather a cheap g


## Create Input and Output Pairs