## **CREATING TOKENS**

In [4]:
with open("the-verdict.txt", "r", encoding ="utf-8") as f:
  raw_text = f.read()
print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [5]:
import re
sent = "Hello, I am waiting for my turn since morning."
res = re.split(r'(\s)', sent)
print(res)

['Hello,', ' ', 'I', ' ', 'am', ' ', 'waiting', ' ', 'for', ' ', 'my', ' ', 'turn', ' ', 'since', ' ', 'morning.']


In [6]:
res = re.split(r'([,.] |\s)', sent)
print(res)

['Hello', ', ', 'I', ' ', 'am', ' ', 'waiting', ' ', 'for', ' ', 'my', ' ', 'turn', ' ', 'since', ' ', 'morning.']


In [7]:
result = [i for i in res if i.strip()]
print(result)

['Hello', ', ', 'I', 'am', 'waiting', 'for', 'my', 'turn', 'since', 'morning.']


# **Removing whitepaces**

In [8]:
sentence  = " Wonderful ! , you have made a   beautiful picture -- . Could you make it for me ? "

res_re = re.split(r'([,."!?;:\'_()[]{}]|--|\s)', sentence )
res_y = [item.strip() for item in res_re if item.strip()]
print(res_y)

['Wonderful', '!', ',', 'you', 'have', 'made', 'a', 'beautiful', 'picture', '--', '.', 'Could', 'you', 'make', 'it', 'for', 'me', '?']


# **Applying this method to the entire "the-verdict"**

In [9]:
preprocessed = re.split(r'([,."!?;:\'_()[]{}]|\(|\)|--|\s)', raw_text)
res_pre = [item.strip() for item in preprocessed if item.strip()]
print(res_pre[:200])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that,', 'in', 'the', 'height', 'of', 'his', 'glory,', 'he', 'had', 'dropped', 'his', 'painting,', 'married', 'a', 'rich', 'widow,', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence.', ')', '"The', 'height', 'of', 'his', 'glory"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it.', 'I', 'can', 'hear', 'Mrs.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication.', '"Of', 'course', "it's", 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'way", 'up;', 'but', 'I', "don't", 'think', 'of', 'that,', 'Mr.', 'Rickham', '--', 'the', 'loss', 'to', 'Arrt', 'is', 'all', 'I', 't

In [10]:
print(len(res_pre))

3834


# **Creating Token IDs**

### **Creating a list of unique tokens & sorting them alphabetically to determine vocabulary size**

In [11]:
words = sorted(set(res_pre))
print(len(words))

1435


In [12]:
vocab = {i:index for index , i in enumerate(words)}
print(vocab)

{'"': 0, '"Ah': 1, '"Ah,': 2, '"Be': 3, '"Begin': 4, '"By': 5, '"Come': 6, '"Destroyed': 7, '"Don\'t': 8, '"Gisburns"': 9, '"Grindles."': 10, '"Hang': 11, '"Has': 12, '"How': 13, '"I': 14, '"I\'d': 15, '"If': 16, '"It': 17, '"It\'s': 18, '"Jack': 19, '"Money\'s': 20, '"Moon-dancers"': 21, '"Mr.': 22, '"Mrs.': 23, '"My': 24, '"Never': 25, '"Never,"': 26, '"Of': 27, '"Oh,': 28, '"Once,': 29, '"Only': 30, '"Or': 31, '"That': 32, '"The': 33, '"Then': 34, '"There': 35, '"There:': 36, '"This': 37, '"We': 38, '"Well,': 39, '"What': 40, '"When': 41, '"Why': 42, '"Yes': 43, '"Yes,': 44, '"You': 45, '"but': 46, '"deadening': 47, '"dragged': 48, '"effects";': 49, '"interesting":': 50, '"lift': 51, '"obituary"': 52, '"strongest,"': 53, '"strongly"': 54, '"sweetly"': 55, "'Are": 56, "'It's": 57, "'coming'": 58, "'done'": 59, "'subject.'": 60, "'technique'": 61, "'way": 62, '(': 63, ')': 64, '--': 65, '.': 66, '."': 67, 'A': 68, 'Among': 69, 'And': 70, 'And,': 71, 'Arrt': 72, 'As': 73, 'At': 74, 'Bu


*   Creating Class
*   Converting token to token IDs
*   Converting token IDs to text



In [18]:
class tokens_encode_decode:
  def __init__(self, vocab):
    self.str_2_int = vocab # this vocab is a dictionary in the form of tokens : index
    self.int_2_str = {index:token for token,index in vocab.items()}
  def encode(self, text):
    pre_pro = re.split(r'([,.:;"\'()]|--|\s)', text)
    pre_pro = [item.strip() for item in pre_pro if item.strip()]
    ids  = [self.str_2_int[index] for index in pre_pro]
    return ids
  def decode(self, ids):
    text = " ".join([self.int_2_str[token] for token in ids])
    text = re.sub(r'([,.;:\'?"!]|--|\s)',r'\1',text)
    return text



In [19]:
sol = tokens_encode_decode(vocab)

txt = "Chicago is a  place to"
ids = sol.encode(txt)
print(ids)


[81, 752, 205, 1003, 1284]


In [20]:
back_to_text = sol.decode(ids)
print(back_to_text)

Chicago is a place to
