In [1]:
import os
# Load the whole training text into memory. The encoding is pinned to UTF-8 so
# the decoded text (and the character count below) does not depend on the
# platform's default locale encoding.
with open("verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()
print("Total characters is :", len(raw_text))
# Peek at the first 100 characters as a sanity check.
print(raw_text[:100])

Total characters is : 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


# Stage 1 : Creating the Embeddings from raw text input

Creating the vocabulary: a mapping from every unique token (words and special characters) to its token ID

In [2]:
import re
# Split on punctuation, double dashes and whitespace. The delimiter group is
# captured, so punctuation marks are kept as tokens of their own.
result = re.split(r'([.,:;_?!"()\']|--|\s)', raw_text)
# Discard the empty strings and whitespace-only pieces produced by the split.
fin_res = list(filter(lambda piece: piece.strip(), result))
# Unique tokens in sorted order, followed by the two special tokens.
all_words = sorted(set(fin_res)) + ["<|unk|>", "<|endoftext|>"]
vocab_size = len(all_words)
# Token -> integer id, where the id is the token's position in all_words.
vocab = dict(zip(all_words, range(vocab_size)))
print("Vocabulary size : ", vocab_size)
# Show the first 26 (token, id) pairs.
for item in list(vocab.items())[:26]:
  print(item)

Vocabulary size :  1132
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)


Tokenizer class that can encode as well as decode

In [3]:
class tokenizerv1:
  """Word-level tokenizer backed by a fixed token -> id vocabulary.

  Text is split on whitespace, double dashes and a fixed set of punctuation
  characters; each punctuation mark becomes a token of its own. Tokens that
  are not in the vocabulary are replaced by the "<|unk|>" entry.
  """

  # The delimiter group is captured so re.split keeps the delimiters as tokens.
  _SPLIT_PATTERN = r'([.,:;_?!"()\']|--|\s)'
  # Punctuation that decode() glues back onto the preceding token. ':' and ';'
  # are included because encode() splits them off as separate tokens too.
  _REJOIN_PATTERN = r'\s+([,.:;?!"()\'])'

  def __init__(self,vocab):
    """Store the vocabulary and build the reverse (id -> token) lookup.

    Args:
      vocab: dict mapping token strings to integer ids.
    """
    self.str_to_int = vocab
    self.int_to_str = {v:k for k,v in vocab.items()}

  def encode(self,text):
    """Convert text into a list of token ids.

    Args:
      text: the string to tokenize.

    Returns:
      A list with one integer id per token, in order of appearance.

    Raises:
      KeyError: if a token is unknown and the vocabulary has no "<|unk|>" entry.
    """
    pieces = re.split(self._SPLIT_PATTERN, text)
    ids = []
    for piece in pieces:
      # Skip empty strings and whitespace-only pieces left over from the split.
      if not piece.strip():
        continue
      if piece not in self.str_to_int:
        piece = "<|unk|>"
      ids.append(self.str_to_int[piece])
    return ids

  def decode(self,ids):
    """Convert a sequence of token ids back into text.

    Tokens are joined with single spaces, then the whitespace in front of
    punctuation marks is removed.

    Args:
      ids: iterable of integer token ids.

    Returns:
      The reconstructed string.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(self._REJOIN_PATTERN, r'\1', text)


In [4]:
# Build the tokenizer from the vocabulary and encode a sample passage into token ids.
tokenizer = tokenizerv1(vocab)
text = "Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him."
ids = tokenizer.encode(text)
print(ids)

[108, 0, 6, 399, 1007, 988, 795, 722, 50, 2, 850, 976, 53, 436, 117, 1016, 418, 988, 420, 1108, 395, 7, 80, 57, 38, 0, 93, 1112, 514, 654, 546, 6, 585, 1077, 444, 987, 994, 879, 687, 546, 7]


In [5]:
# Decode the ids back to text. The round trip is not exact: in the output below
# "--" gains surrounding spaces and the "s" after an apostrophe is separated by a space.
tokenizer.decode(ids)

"Well! -- even through the prism of Hermia' s tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him -- it was fitting that they should mourn him."

Now testing the tokenizer with the two special tokens added to the vocabulary: `<|unk|>` for unknown words and `<|endoftext|>` as a separator between texts:

In [9]:
# Vocabulary size, including the two special tokens appended when the vocab was built.
print(len(vocab))
# "Tanvi" is not in the vocabulary, so encode() maps it to the <|unk|> id.
text = "even through the prism of Hermia's tears I felt Tanvi to face"
print(tokenizer.encode(text))
text2 = "Poor Jack Gisburn! The women had made him"
# Join the two texts with <|endoftext|> as the separator between them.
text3 = " <|endoftext|> ".join((text,text2))
ids = tokenizer.encode(text3)
print(tokenizer.decode(ids))

1132
[399, 1007, 988, 795, 722, 50, 2, 850, 976, 53, 436, 1130, 1016, 418]
even through the prism of Hermia' s tears I felt <|unk|> to face <|endoftext|> Poor Jack Gisburn! The women had made him


Trying out the byte-pair encoding (BPE) tokenizer from `tiktoken`, using the GPT-2 encoding

In [11]:
import importlib
import tiktoken
# NOTE(review): this rebinds `tokenizer`, which previously held a tokenizerv1
# instance; from here on it refers to the tiktoken GPT-2 encoding.
tokenizer = tiktoken.get_encoding("gpt2")
text = "I felt the happiest at DiagonAlley with Hagrid <|endoftext|>"
# allowed_special lets "<|endoftext|>" be encoded as a special token; in the
# output below it appears as the single trailing id 50256.
ids = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(ids)

[40, 2936, 262, 49414, 379, 6031, 1840, 2348, 1636, 351, 21375, 6058, 220, 50256]


In [12]:
# Decode the BPE ids back into a string; the output below matches the input text exactly.
tokenizer.decode(ids)

'I felt the happiest at DiagonAlley with Hagrid <|endoftext|>'