# Creation of a tokenizer based on LLMs from scratch book

In [2]:
with open("the_verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
print(f"Number of characters: {len(raw_text)}")
print(f"First 99 characters:\n{raw_text[:99]}")


Number of characters: 20479
First 99 characters:
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## The scope of this tokenizer is to process each word and special character and embbed it

In [3]:
import re
text = "Hellow world. Hello again-- from re."
#result = re.split(r'(\s)', text)
result = re.split(r'([,.:,?_!"()\']|--|\s)', text)
print(result)
result = [item for item in result if item.strip()]
print(result)

['Hellow', ' ', 'world', '.', '', ' ', 'Hello', ' ', 'again', '--', '', ' ', 'from', ' ', 're', '.', '']
['Hellow', 'world', '.', 'Hello', 'again', '--', 'from', 're', '.']


In [4]:
preprocessed = re.split(r'([,.:,?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4669


In [5]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Convert the tokens in token_ids. Firstly we need to define a vocabulary
The vocabulary contains every token from the text and it should be written in alphabetical order

In [6]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1143


In [15]:
vocab = {token:integer for integer,token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i>=50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindles', 44)
('HAD', 45)
('Had', 46)
('Hang', 47)
('Has', 48)
('He', 49)
('Her', 50)


In [16]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text):
        preprocessed = re.split(r'([,.:,?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'"])', r'\1', text)
        return text

In [18]:
tokenizer = SimpleTokenizerV1(vocab)
text = """
Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity.
"""
ids = tokenizer.encode("pula")
print(ids)

[54, 45, 152, 1014, 58, 39, 828, 117, 261, 491, 6, 1013, 117, 505, 440, 397, 6, 919, 591, 1089, 717, 513, 972, 1027, 671, 1027, 540, 998, 5, 574, 999, 543, 730, 555, 501, 5, 538, 519, 375, 555, 757, 5, 669, 117, 851, 1114, 5, 160, 402, 553, 574, 117, 1078, 735, 999, 86, 7, 3, 101, 54, 828, 1014, 591, 1133, 535, 212, 87, 742, 35, 7, 4, 1, 95, 543, 730, 555, 501, 1, 6, 998, 1089, 1101, 999, 1125, 247, 591, 7, 54, 249, 540, 68, 7, 38, 102, 6, 555, 610, 26, 908, 6, 331, 555, 1053, 118, 7, 1, 74, 302, 591, 2, 861, 503, 1027, 877, 999, 1071, 730, 705, 779, 2, 1095, 1064, 244, 54, 364, 2, 981, 1009, 730, 998, 5, 67, 7, 84, 6, 999, 654, 1027, 16, 590, 148, 54, 1009, 730, 7, 1, 95, 1129, 5, 735, 68, 7, 102, 2, 861, 641, 5, 701, 593, 116, 858, 116, 180, 1013, 1005, 1100, 837, 574, 159, 394, 1081, 730, 685, 7, 14, 591, 1089, 719, 739, 999, 68, 7, 103, 1109, 696, 7, 46, 719, 999, 415, 51, 29, 5, 183, 999, 610, 41, 37, 893, 5, 940, 671, 213, 39, 2, 861, 1, 66, 1, 1027, 867, 5, 1121, 987, 574, 544, 

In [14]:
print(tokenizer.decode(ids))

I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdication." Of course it' s going to send the value of my picture' way up; but I don' t think of that, Mr. Rickham -- the loss to Arrt is all I think of." The word, on Mrs. Thwing' s lips, multiplied its _ rs _ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn' s" Moon-dancers" to say, with tears in her eyes :" We shall not look upon its like again"? Well!