Creating Tokens

In [1]:
# Read the whole short story into memory as one string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the text size and preview its first 99 characters.
total_chars = len(raw_text)
print("Total number of characters :", total_chars)
print(raw_text[:99])

Total number of characters : 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


We'll use Python's built-in regular expression module (`re`) to split the text on whitespace.

In [4]:
import re
# Split on single whitespace characters. The capturing group in the pattern
# makes re keep each separator as its own element of the result.
text = "Hello, There. This, is a test"
whitespace_pattern = re.compile(r'(\s)')
result = whitespace_pattern.split(text)
print(result)

['Hello,', ' ', 'There.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test']


In [5]:
# Also split on commas and periods so punctuation becomes its own element.
# Adjacent separators (e.g. ", ") produce empty strings in between.
punct_or_space_pattern = re.compile(r'([,.]|\s)')
result = punct_or_space_pattern.split(text)
print(result)

['Hello', ',', '', ' ', 'There', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test']


In [12]:
# Keep only the pieces that still contain something after stripping, i.e.
# discard the empty strings and the whitespace-only separators.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'There', '.', 'This', ',', 'is', 'a', 'test']


In [14]:
# Tokenize the full story: split on punctuation, double dashes and whitespace,
# then discard the empty and whitespace-only pieces.
preprocessed = [
    piece
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [16]:
# Total number of tokens obtained from the story.
num_tokens = len(preprocessed)
print(num_tokens)

4690


Creating token IDs

In [23]:
# The vocabulary is the sorted collection of distinct tokens.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [38]:
# Next, assign an integer token ID to every word in the vocabulary; mapping tokens to IDs is known as encoding.

In [40]:
# Map each token to its position in the sorted vocabulary list.
vocab = dict(zip(all_words, range(len(all_words))))

In [42]:
# Preview the first 52 (token, id) pairs of the vocabulary.
preview_count = 52
for i, item in enumerate(vocab.items()):
    print(item)
    if i + 1 >= preview_count:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)
('His', 51)


In [50]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on the double dash ``--`` and on the
    punctuation characters ``, . : ; ? _ ! " ( ) '``. This is the same pattern
    used to preprocess the raw text when the vocabulary is built, so every
    piece produced by ``encode`` is a piece the vocabulary could contain.

    Args:
        vocab: dict mapping each token string to its integer ID.
    """

    # Must match the pattern used to build the vocabulary; otherwise tokens
    # such as "word!" are never split and cannot be looked up.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that " ".join() inserted in front of a punctuation mark.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.;:?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping used by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Drop the empty strings and whitespace-only separators.
        tokens = [piece for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a text string.

        Tokens are joined with single spaces, then the space in front of each
        punctuation mark is removed. Space after punctuation is kept, so the
        result is not guaranteed to equal the originally encoded text.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [54]:
# Build a tokenizer from the vocabulary created above and encode a sample
# sentence whose tokens are all present in that vocabulary.
tokenizer = SimpleTokenizerV1(vocab)

text = """It's the last he painted, you know,"Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [56]:
# Map the IDs back to text. Only the space *before* punctuation is removed,
# so the round trip is not exact (e.g. "It's" comes back as "It' s").
tokenizer.decode(ids)

'It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

Above, we implemented a simple tokenizer and saw that its encode and decode methods can convert text to token IDs and back for a sentence built from words in the training text.
Now let's see whether it can also handle sample text that is not contained in the training set.

In [60]:
# A sentence containing a word ("Hello") that is not in the vocabulary.
text = "Hello, do you like tea?"

In [62]:
# "Hello" is not in the vocabulary, so encode() raises KeyError. Catch it and
# print it so the failure is shown without stopping a Restart & Run All.
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Hello'

In [64]:
# The KeyError occurs because "Hello" never appears in the small training text, so it is missing from the vocabulary. This is why LLMs are trained on large, diverse datasets.

Adding Special Context Tokens 
In this section we will modify the tokenizer to handle unknown words.
In particular, we will extend the vocabulary and the tokenizer implemented in the previous section (SimpleTokenizerV1), creating SimpleTokenizerV2, to support two new tokens: <|unk|> and <|endoftext|>.

In [74]:
# Rebuild the sorted token list and append the two special tokens at the end,
# so they receive the two highest IDs.
special_tokens = ["<|endoftext|>", "<|unk|>"]
all_tokens = sorted(set(preprocessed)) + special_tokens

vocab = dict((token, index) for index, token in enumerate(all_tokens))

In [78]:
# Number of entries in the extended vocabulary.
len(vocab)

1132

In [80]:
# The previous vocabulary had 1130 entries; the new one has 2 more (the two special tokens), for a total of 1132.

In [94]:
# Show the last five (token, id) pairs to confirm the special tokens were
# appended at the end of the vocabulary. No index is needed here, so iterate
# over the entries directly instead of through enumerate().
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [150]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: dict mapping each token string to its integer ID. Encoding text
            with unknown tokens requires it to contain the key ``"<|unk|>"``.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse lookup table used when decoding.
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Tokens missing from the vocabulary are replaced by ``<|unk|>`` before
        the ID lookup.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty string or whitespace-only separator: not a token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space in front of punctuation marks removed."""
        tokens = (self.int_to_str[token_id] for token_id in ids)
        joined = " ".join(tokens)
        return re.sub(r'\s+([,.;:?!"()\'])', r'\1', joined)

In [152]:
# Tokenizer built on the extended vocabulary (with the special tokens).
tokenizer = SimpleTokenizerV2(vocab)

# Two unrelated text samples ...
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# ... concatenated with the end-of-text marker between them.
separator = " <|endoftext|> "
text = separator.join([text1, text2])

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [154]:
# Words missing from the vocabulary ("Hello", "palace") map to the <|unk|> ID
# (1131); the <|endoftext|> marker maps to 1130.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [156]:
# Round trip: words missing from the vocabulary come back as <|unk|>, so the
# original text cannot be fully recovered.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

The tokenizer used by GPT models does not use an '<|unk|>' token for out-of-vocabulary words. Instead, it uses a byte pair encoding (BPE) tokenizer, which breaks words down into subword units.