In [44]:
import re

In [45]:
text = "This, is some text that does'nt mean any thing."
result = re.split(r'([,.]|\s)', text)

print(result)

['This', ',', '', ' ', 'is', ' ', 'some', ' ', 'text', ' ', 'that', ' ', "does'nt", ' ', 'mean', ' ', 'any', ' ', 'thing', '.', '']


In [46]:
result = [item for item in result if item.strip()]
print(result)

['This', ',', 'is', 'some', 'text', 'that', "does'nt", 'mean', 'any', 'thing', '.']


In [47]:
with open("bht.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()
    
print("Total characters available :", len(raw_text))
print(raw_text[:99])

Total characters available : 375476
A Brief History of Time - Stephen Hawking
Chapter 1 - Our Picture of the Universe
Chapter 2 - Space


In [48]:
raw_data = re.split(r'(\s|[.,!?;:"\'()\[\]{}<>|\\/`~@#$%^&*_+=-]|\.\.\.)', raw_text)
'''
Removal of white spaces is crucial aspect to consider if the data 
is like codes or where the spaces matter then include the spaces 
in the data otherwise its better to not use them.
'''

raw_data = [token for token in raw_data if token.strip()]
print("Total tokens available :",len(raw_data))
print(raw_data[:99])

Total tokens available : 73010
['A', 'Brief', 'History', 'of', 'Time', '-', 'Stephen', 'Hawking', 'Chapter', '1', '-', 'Our', 'Picture', 'of', 'the', 'Universe', 'Chapter', '2', '-', 'Space', 'and', 'Time', 'Chapter', '3', '-', 'The', 'Expanding', 'Universe', 'Chapter', '4', '-', 'The', 'Uncertainty', 'Principle', 'Chapter', '5', '-', 'Elementary', 'Particles', 'and', 'the', 'Forces', 'of', 'Nature', 'Chapter', '6', '-', 'Black', 'Holes', 'Chapter', '7', '-', 'Black', 'Holes', 'Ain', "'", 't', 'So', 'Black', 'Chapter', '8', '-', 'The', 'Origin', 'and', 'Fate', 'of', 'the', 'Universe', 'Chapter', '9', '-', 'The', 'Arrow', 'of', 'Time', 'Chapter', '10', '-', 'Wormholes', 'and', 'Time', 'Travel', 'Chapter', '11', '-', 'The', 'Unification', 'of', 'Physics', 'Chapter', '12', '-', 'Conclusion', 'Glossary', 'Acknowledgments', '&', 'About', 'The']


In [67]:
all_tokens = sorted(list(set(raw_data)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

print(len(all_tokens))

5088


In [74]:
vocab = {token:integer for integer,token in enumerate(all_tokens)}
for i, token in enumerate(vocab.items()):
    print(token)
    if i>= 20:
        break
        
# print(vocab)

('!', 0)
('"', 1)
('&', 2)
("'", 3)
('(', 4)
(')', 5)
('+', 6)
(',', 7)
('-', 8)
('.', 9)
('/', 10)
('0', 11)
('00', 12)
('000', 13)
('000000003335640952', 14)
('004th', 15)
('1', 16)
('10', 17)
('100', 18)
('11', 19)
('12', 20)


In [69]:
class TokenizerV1:
    def __init__(self, vocab):
        self.str_int = vocab
        self.int_str = {i:s for s,i in vocab.items()}
        
    def encode(self, text):
        preprocessed = re.split(r'(\s|[.,!?;:"\'()\[\]{}<>|\\/`~@#$%^&*_+=-]|\.\.\.)', text)
        
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_str[i] for i in ids])
        text = re.sub(r'\s+([.,!?;:"\'()\[\]{}<>|\\/`~@#$%^&*_+=-]|\.\.\.)', r'\1', text)
        return text

In [70]:
tokenizer = TokenizerV1(vocab)

text = """On the observational side, by far the most important development has been the measurement of fluctuations in
the cosmic microwave background radiation by COBE (the Cosmic Background Explorer satellite) and other
collaborations."""

ids = tokenizer.encode(text)
print(ids)


[723, 4596, 3428, 4249, 7, 1537, 2433, 4596, 3300, 2832, 2035, 2703, 1419, 4596, 3214, 3453, 2505, 2839, 4596, 1879, 3248, 1385, 3860, 1537, 252, 4, 4596, 312, 208, 389, 4136, 5, 1207, 3508, 1683, 9]


In [71]:
tokenizer.decode(ids)


'On the observational side, by far the most important development has been the measurement of fluctuations in the cosmic microwave background radiation by COBE( the Cosmic Background Explorer satellite) and other collaborations.'

In [75]:
class TokenizerV2:
    def __init__(self, vocab):
        self.str_int = vocab
        self.int_str = {i:s for s,i in vocab.items()}
        
    def encode(self, text):
        preprocessed = re.split(r'(\s|[.,!?;:"\'()\[\]{}<>|\\/`~@#$%^&*_+=-]|\.\.\.)', text)
        
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [
            item if item in self.str_int
            else "<|unk|>" for item in preprocessed
        ]
        ids = [self.str_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_str[i] for i in ids])
        text = re.sub(r'\s+([.,!?;:"\'()\[\]{}<>|\\/`~@#$%^&*_+=-]|\.\.\.)', r'\1', text)
        return text

In [76]:
tokenizer = TokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "Hello, would you like some Coffee?"

text = "<endoftext|>".join((text1, text2))

print(text)

ids = tokenizer.encode(text)
print(ids)

Hello, do you like tea?<endoftext|>Hello, would you like some Coffee?
[5087, 7, 2133, 5003, 3084, 5087, 130, 5087, 5087, 5087, 5087, 5087, 7, 4986, 5003, 3084, 4325, 5087, 130]


In [77]:
tokenizer.decode(tokenizer.encode(text))


'<|unk|>, do you like<|unk|>?<|unk|><|unk|><|unk|><|unk|><|unk|>, would you like some<|unk|>?'