## Token Creation

In [23]:
# Read the entire text file into memory as a single string.
# NOTE(review): the absolute "/content/..." path and the doubled ".txt.txt"
# extension look environment-specific — confirm the file name before running elsewhere.
with open("/content/verdict.txt.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Report the character count and preview the first 99 characters
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20480
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [1]:
import re

In [14]:
text = "hello World,when it comes to Programmer their preferred word is Hello World"

# Split on single whitespace characters; the capture group keeps every
# separator in the result as its own list item
token = re.compile(r'(\s)').split(text)

print(f"Tokens : {token}")

Tokens : ['hello', ' ', 'World,when', ' ', 'it', ' ', 'comes', ' ', 'to', ' ', 'Programmer', ' ', 'their', ' ', 'preferred', ' ', 'word', ' ', 'is', ' ', 'Hello', ' ', 'World']


In [20]:
# Define the sample sentence in this cell so the result does not depend on
# whichever `text` the kernel happened to hold when the cell was run
text = "Hello, world. Is this-- a test?"

# Split on commas, full stops and whitespace; the capture group keeps the
# separators (and the empty strings between adjacent separators) in the list
token = re.split(r'([.,]|\s)', text)

print(f"Tokens : {token}")

Tokens : ['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this--', ' ', 'a', ' ', 'test?']


In [21]:
# Keep only pieces that still contain something after stripping: str.strip
# returns "" (falsy) for empty or whitespace-only pieces, so filter drops them
token = list(filter(str.strip, token))

print(f"Tokens : {token}")

Tokens : ['Hello', ',', 'world', '.', 'Is', 'this--', 'a', 'test?']


In [22]:
# Usage of Tokenization on raw text in single pass
text = "Hello, world. Is this-- a test?"
# Split on punctuation, "--" and whitespace, discarding blank pieces in the same expression
token = [piece for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text) if piece.strip()]
print(token)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [24]:
# Tokenise the full text with the same pattern: strip every piece and keep
# only the ones that are non-empty afterwards
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
    if piece
]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [25]:
# Total number of tokens produced from the full text
print(len(preprocessed))


4690


## Token ID

In [26]:
# The vocabulary is the set of distinct tokens, in sorted order
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

1130


In [27]:
# Map each token to its position in the sorted word list
vocab = {word: idx for idx, word in enumerate(all_words)}

In [28]:
# Preview the first 51 (token, id) pairs of the vocabulary
for i, item in enumerate(list(vocab.items())[:51]):
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [31]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Every resulting piece must already be a key of
    the vocabulary; unknown pieces are not handled.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) lookup.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one id per non-blank piece, in order of appearance.

        Raises:
            KeyError: If a piece of ``text`` is not in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Trim each piece and drop the ones that are empty or whitespace-only
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert token ids back into a string.

        Tokens are joined with single spaces, so the original spacing is not
        restored exactly.

        Args:
            ids: Iterable of token ids produced by ``encode``.

        Returns:
            The joined text, with the space before punctuation removed.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that join() put before punctuation. ':' and ';' are
        # included because encode() splits on them too.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [32]:
# Build a tokenizer over the vocabulary and encode a sample sentence
tokenizer = SimpleTokenizerV1(vocab)

text = """It's the last he painted, you know,"Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [33]:
# Map the ids back to text; tokens are re-joined with spaces, so the spacing
# can differ from the original sentence
tokenizer.decode(ids)


'It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [43]:
# "Hello" is not in the vocabulary, so SimpleTokenizerV1.encode raises KeyError.
# Catch it so the failure is displayed without stopping a "Run All".
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print(f"KeyError: token {err} is not in the vocabulary")

The code above raises a `KeyError` because `SimpleTokenizerV1` can only encode and decode words that are in the vocabulary; it cannot handle unknown words.

To handle the out-of-vocabulary problem, a special token for unknown words, `<|unk|>`, is added to the vocabulary, together with an `<|endoftext|>` token.

In [35]:
# Sorted distinct tokens, followed by the two special tokens at the end
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

# Rebuild the token -> id mapping over the extended list
vocab = {tok: idx for idx, tok in enumerate(all_tokens)}

In [36]:
# Size of the extended vocabulary
len(vocab)


1132

In [37]:
# Show the last five (token, id) pairs; the two special tokens were appended
# after sorting, so they appear at the very end
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [38]:
class SimpleTokenizerV2:
    """Word-level tokenizer that substitutes ``<|unk|>`` for unknown words.

    Splitting follows the same rule for every input: whitespace, ``--`` and
    the punctuation characters ``, . : ; ? _ ! " ( ) '`` act as separators.
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Turn ``text`` into a list of token ids.

        Pieces that are not keys of the vocabulary are replaced by
        ``"<|unk|>"`` before the id lookup.

        Args:
            text: The string to tokenize.

        Returns:
            A list of integer ids, one per non-blank piece.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            piece = piece.strip()
            if not piece:
                # Empty or whitespace-only pieces carry no token
                continue
            if piece not in self.str_to_int:
                piece = "<|unk|>"
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, ids):
        """Turn a sequence of token ids back into text.

        Args:
            ids: Iterable of token ids.

        Returns:
            The tokens joined by single spaces, with the space before
            punctuation removed.
        """
        words = [self.int_to_str[i] for i in ids]
        joined = " ".join(words)
        # Drop the whitespace that join() placed in front of punctuation
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [39]:
# Switch to the tokenizer that maps out-of-vocabulary words to <|unk|>
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join the two samples with the <|endoftext|> token between them
text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [40]:
# Words missing from the vocabulary are encoded with the id of <|unk|>
tokenizer.encode(text)


[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [41]:
# Round trip: unknown words come back as the literal <|unk|> token
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'