Reading the file

In [1]:
# Load the entire corpus into a single string; the context manager closes
# the file handle as soon as the read completes.
with open('Token.txt', mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# Report the corpus size in characters as a quick sanity check.
char_count = len(raw_text)
print("Total number of character:", char_count)

Total number of character: 10972


In [2]:
# Show only the first 99 characters to confirm the file loaded as expected.
preview = raw_text[:99]
print(preview)

The morning—if it could even be called that, given the barely-there light and oppressive silence—cr


Splitting the text on whitespace (keeping the separators)

In [3]:
import re

# Short sample sentence used to try out the regex splitting in the following cells.
retext = "Hello, world. This is test"

In [4]:
# The capturing group makes the split keep every whitespace separator
# as its own element of the resulting list.
whitespace_pattern = re.compile(r'(\s)')
result = whitespace_pattern.split(retext)
print(result)

['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'test']


Splitting on special characters

In [5]:
# Split on commas, periods, or any whitespace character; the capturing group
# keeps each separator, and adjacent separators produce empty strings.
comma_period_pattern = re.compile(r'([,.]|\s)')
result = comma_period_pattern.split(retext)
print(result)


['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'test']


Removing whitespace-only and empty entries

In [6]:
# Keep only the entries that are non-empty once surrounding whitespace is
# stripped; this drops both the empty strings and the whitespace separators.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', '.', 'This', 'is', 'test']


Handling other punctuation marks

In [7]:
text = "Hello, world. Is this -- a test?"

# Separators: a single punctuation character from the set, a double hyphen,
# or any whitespace character. The capturing group keeps them in the output.
punctuation_pattern = re.compile(r'([,.:;?_!"()\']|--|\s)')
result = punctuation_pattern.split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', ' ', '', '--', '', ' ', 'a', ' ', 'test', '?', '']


Applying the tokenization to the main text

In [8]:
# Tokenize the full corpus with the same pattern used on the sample sentence:
# split on listed punctuation, a double hyphen, or whitespace, keeping separators.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)

# Strip each piece once and drop the pieces that end up empty
# (whitespace separators and the empty strings between adjacent separators).
preprocessed = [token for token in (piece.strip() for piece in preprocessed) if token]

# NOTE(review): em dashes, asterisks and curly quotes are not in the pattern, so
# they stay attached to neighbouring words (e.g. 'morning—if', '**The' in the
# outputs below) — confirm whether that is intended.

# Print the token count rather than the whole list: the full dump floods the
# notebook output, and a preview of the tokens is shown in the next cell.
print("Total number of tokens:", len(preprocessed))



Taking the first thirty tokens

In [9]:
# Display just the leading 30 tokens as a quick check of the tokenization.
first_tokens = preprocessed[:30]
print(first_tokens)

['The', 'morning—if', 'it', 'could', 'even', 'be', 'called', 'that', ',', 'given', 'the', 'barely-there', 'light', 'and', 'oppressive', 'silence—crept', 'in', 'slowly', ',', 'lazily', ',', 'as', 'if', 'unsure', 'whether', 'to', 'arrive', 'at', 'all', ';']


Counting the number of unique tokens (vocabulary size)

In [10]:
# Deduplicate the tokens, then sort them so the ordering is deterministic.
all_words = list(set(preprocessed))
all_words.sort()

# The vocabulary size is the number of distinct tokens.
vocab_size = len(all_words)
print(vocab_size)

896


Creating a dictionary that maps each unique token to an integer token ID

In [None]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Print the first 51 (token, id) pairs to inspect the start of the vocabulary.
for i, item in enumerate(list(vocab.items())[:51]):
    print(item)

('!', 0)
('"', 1)
('&', 2)
("'", 3)
('(', 4)
(')', 5)
('*', 6)
('**The', 7)
('**URGENT**', 8)
('**“For', 9)
('*Is', 10)
('*brace', 11)
('*built', 12)
('*cared', 13)
('*held', 14)
('*kept*', 15)
('*remembered*', 16)
('*thud*', 17)
('*“Thorne', 18)
(',', 19)
('-', 20)
('--', 21)
('.', 22)
('2', 23)
('37', 24)
('6', 25)
(':', 26)
(';', 27)
('?', 28)
('A', 29)
('AM', 30)
('And', 31)
('As', 32)
('At', 33)
('Because', 34)
('Behind', 35)
('Better', 36)
('Brutus', 37)
('But', 38)
('Check', 39)
('Clockmaker’s', 40)
('Curious', 41)
('Delay', 42)
('Demanding', 43)
('Drawn', 44)
('Eleanor', 45)
('Elias', 46)
('Finally', 47)
('Finish', 48)
('Flashing', 49)
('For', 50)


Creating a simple tokenizer

In [12]:
class simpletokenizer:
    """Minimal regex-based tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double hyphens and whitespace, and each
    resulting token is looked up in the vocabulary passed to the constructor.
    """

    # Same split pattern that was used to build the vocabulary from the corpus.
    # It must include ':' and ';' — otherwise text such as "all;" stays fused,
    # is not found in the vocabulary, and is silently dropped by encode().
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')

    # Whitespace that precedes one of these characters is removed when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the token -> id mapping and build the inverse id -> token mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Tokens that are not present in the vocabulary are skipped, so the
        returned list may be shorter than the number of tokens in ``text``.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, in the order the known tokens appear in ``text``.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Strip each piece and drop the empty / whitespace-only ones.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # Unknown tokens are deliberately skipped rather than raising KeyError.
        return [self.str_to_int[token] for token in tokens if token in self.str_to_int]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation characters is removed.

        Args:
            ids: iterable of integer ids that exist in the vocabulary.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

Testing the tokenizer on a sample sentence

In [13]:
# Sample sentence to run through the tokenizer.
text = "Curious and unable to resist, Elias touched it."

# Build the tokenizer from the corpus vocabulary and encode the sentence.
tokenizer = simpletokenizer(vocab)
ids = tokenizer.encode(text)
print(ids)

[41, 138, 789, 770, 636, 19, 46, 776, 425, 22]


Decoding the token IDs back into text

In [14]:
# Turn the token ids back into a string and show the result.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

Curious and unable to resist, Elias touched it.
