# Creating Tokens

In [179]:
with open("the-verdict (1).txt","r",encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20480
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


#### Goal is to tokenize all the characters in the txt file that can be later turned into embeddings for LLMs

In [181]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)',text)

print(result) # result of individual words , whitespaces an punctuation characters

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [182]:
result = re.split(r'([,.]|\s)',text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [183]:
# This will prevent the spaces to be considered as tokens
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [184]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"()|\-\s]+)', text)
result = [item.strip() for item in result if item.strip()]
print(result)


['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [185]:
preprocessed = re.split(r'([,.:;?_!"()|\-\s])',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '-', '-', 'though', 'a', 'good', 'fellow', 'enough', '-', '-', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that']


In [186]:
print(len(preprocessed))

4692


 # Creating Token IDs

In [188]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1163


In [189]:
 vocab = {token:integer for integer, token in enumerate(all_words)}

In [190]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break 

('!', 0)
('"', 1)
("'", 2)
("'Are", 3)
("'It's", 4)
("'coming'", 5)
("'done'", 6)
("'subject", 7)
("'technique'", 8)
("'way", 9)
('(', 10)
(')', 11)
(',', 12)
('-', 13)
('.', 14)
(':', 15)
(';', 16)
('?', 17)
('A', 18)
('Ah', 19)
('Among', 20)
('And', 21)
('Arrt', 22)
('As', 23)
('At', 24)
('Be', 25)
('Begin', 26)
('Burlington', 27)
('But', 28)
('By', 29)
('Carlo', 30)
('Chicago', 31)
('Claude', 32)
('Come', 33)
('Croft', 34)
('Destroyed', 35)
('Devonshire', 36)
("Don't", 37)
('Dubarry', 38)
('Emperors', 39)
('Florence', 40)
('For', 41)
('Gallery', 42)
('Gideon', 43)
('Gisburn', 44)
("Gisburn's", 45)
('Gisburns', 46)
('Grafton', 47)
('Greek', 48)
('Grindle', 49)
("Grindle's", 50)


In [191]:
class SimpleTokenizer:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\-\s])',text)
        processed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed if s.strip() and s in self.str_to_int]
        return ids
        
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r's+([,.?!"()\'])',r'\1', text)
        return text

#### Instantiate a tokenizer object from the tokenizer class

In [193]:
tokenizer = SimpleTokenizer(vocab)

text = """"" It's the last he painted, you know,"
             Mrs. Gisburn said to pardonable pride."""

ids = tokenizer.encode(text)
print(ids)

[1, 1, 69, 1016, 627, 552, 772, 12, 1157, 621, 12, 1, 81, 14, 44, 879, 1046, 781, 822, 14]


In [194]:
tokenizer.decode(ids)

'" " It\'s the last he painted , you know , " Mrs . Gisburn said to pardonable pride .'

# Special Context Tokens

### These are something that are used to deal with the unknown words which
### are not present in the training vocubulary or training data.

#### Here simpleTokenizer is modified

In [197]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>","<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [198]:
len(vocab.items())

1165

In [199]:
for i, item in enumerate(list(vocab.items())[-3:]):
    print(item)

('yourself', 1162)
('<|endoftext|>', 1163)
('<|unk|>', 1164)


In [200]:
# Modification in SimpleTokenizer

class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}
        
    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\-\s])',text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed 
        ]
        ids = [self.str_to_int[s] for s in preprocessed if s.strip() and s in self.str_to_int]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [231]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "in the sunlit terrace of the palace."

text = " <|endoftext|> ".join((text1,text2))

print(text)


Hello, do you like tea? <|endoftext|> in the sunlit terrace of the palace.


In [233]:
tokenizer.encode(text)

[1164,
 12,
 378,
 1157,
 653,
 1004,
 17,
 1163,
 592,
 1016,
 986,
 1011,
 748,
 1016,
 1164,
 14]

In [235]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> in the sunlit terrace of the <|unk|>.'