In [None]:
import re
# Read the whole short story into memory as a single string; the context
# manager closes the file handle as soon as the read finishes.
with open("Data/the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()



In [8]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, on the double dash ``--`` and on the
    punctuation characters ``, . : ; ? _ ! " ( ) '``. Every non-empty piece
    is treated as one token and looked up in the vocabulary.
    """

    # The delimiters sit inside a capture group so that re.split() keeps the
    # punctuation marks as tokens instead of discarding them.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace that " ".join() leaves in front of punctuation on decode.
    # Includes ':' and ';' so that every punctuation mark encode() splits off
    # (other than '_' and '--') is re-attached to the preceding word.
    _SPACE_BEFORE_PUNCT_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) mapping.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one id per token, in order of appearance.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop the whitespace-only and empty pieces produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        ids = []
        for token in tokens:
            try:
                ids.append(self.str_to_int[token])
            except KeyError:
                # Same exception type as a plain lookup, clearer message.
                raise KeyError(
                    f"token {token!r} is not in the vocabulary"
                ) from None
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed. The original whitespace is not recorded by
        ``encode``, so the result is not guaranteed to equal the input text.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        tokens = []
        for token_id in ids:
            try:
                tokens.append(self.int_to_str[token_id])
            except KeyError:
                raise KeyError(
                    f"token id {token_id!r} is not in the vocabulary"
                ) from None

        text = " ".join(tokens)
        # Remove the spaces before the specified punctuation marks.
        return re.sub(self._SPACE_BEFORE_PUNCT_PATTERN, r'\1', text)

In [15]:
# Tokenize the raw text: split on punctuation, "--" and whitespace (keeping
# the delimiters), then discard the empty / whitespace-only pieces.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]

# Each distinct token gets the id of its position in sorted order.
sorted_words = sorted(set(preprocessed))
vocab = dict(zip(sorted_words, range(len(sorted_words))))

tokenizer = SimpleTokenizerV1(vocab)


In [23]:
# Encode the complete text into token ids.
enc_text = tokenizer.encode(raw_text)

# Show only the first ten ids as a quick sanity check.
print("Encoded text (ids): ", enc_text[0:10])

Encoded text (ids):  [53, 44, 149, 1003, 57, 38, 818, 115, 256, 486]


In [20]:
# Number of token ids in one input window.
context_size = 5

# The target window is the input window shifted one position to the right.
input_text, target_text = (
    enc_text[0:context_size],
    enc_text[1:1 + context_size],
)

print("Input text (ids): ", input_text)
print("Target text (ids):    ", target_text)

Input text (ids):  [53, 44, 149, 1003, 57]
Target text (ids):     [44, 149, 1003, 57, 38]


In [66]:
# Print growing prefixes of the encoded text next to the id that follows
# each prefix. At most eleven pairs (i = 0 .. 10) are shown, and fewer when
# the text has fewer than context_size + 11 ids.
for i in range(min(11, len(enc_text) - context_size)):
    input_text, target_text = enc_text[:i], enc_text[i]
    print("Input text (ids): ", input_text, "->", target_text)



Input text (ids):  [] -> 53
Input text (ids):  [53] -> 44
Input text (ids):  [53, 44] -> 149
Input text (ids):  [53, 44, 149] -> 1003
Input text (ids):  [53, 44, 149, 1003] -> 57
Input text (ids):  [53, 44, 149, 1003, 57] -> 38
Input text (ids):  [53, 44, 149, 1003, 57, 38] -> 818
Input text (ids):  [53, 44, 149, 1003, 57, 38, 818] -> 115
Input text (ids):  [53, 44, 149, 1003, 57, 38, 818, 115] -> 256
Input text (ids):  [53, 44, 149, 1003, 57, 38, 818, 115, 256] -> 486
Input text (ids):  [53, 44, 149, 1003, 57, 38, 818, 115, 256, 486] -> 6
