In [1]:
# Sample training sentence; it is split into tokens and turned into a vocabulary below.
train_test = (
    "The quick brown fox jumps over the lazy dog "
    "and chased swift playfully"
)

In [2]:
import re
# Split on punctuation, double dashes and whitespace. The capturing group makes
# re.split keep the separators, so punctuation survives as its own token.
tokenize = re.split(r'([,./"]|--|\s)', train_test)

# Discard the empty and whitespace-only pieces that the split leaves behind.
train_tokenize = [piece for piece in tokenize if piece and piece.strip()]

# Each token's id is its position in the token list.
# NOTE: a token that occurs more than once keeps the id of its last occurrence,
# so repeated tokens would leave gaps in the id range.
vocab = dict(zip(train_tokenize, range(len(train_tokenize))))

print(train_tokenize, vocab, sep="\n")


['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'and', 'chased', 'swift', 'playfully']
{'The': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumps': 4, 'over': 5, 'the': 6, 'lazy': 7, 'dog': 8, 'and': 9, 'chased': 10, 'swift': 11, 'playfully': 12}


In [3]:
# Simple word-level tokenizer; the split pattern in encode() can be extended to cover more punctuation if needed.
class TokenizersV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Parameters
    ----------
    vocab : dict
        Mapping from token string to integer id. The reverse mapping used
        for decoding is derived from it, so ids are expected to be unique.
    """

    def __init__(self,vocab):
        self.word_to_int = vocab
        # Reverse lookup (id -> token) used by decode().
        self.int_to_word = {i:s for s,i in vocab.items()}

    def encode(self,word):
        """Convert a text string into a list of token ids.

        The text is lower-cased before lookup, so vocabulary entries that
        contain upper-case characters are never produced by this method.

        Raises
        ------
        KeyError
            If any token of the text is missing from the vocabulary.
        """
        word = word.lower()
        # The capturing group keeps the separators, so punctuation becomes
        # a token of its own instead of being thrown away.
        preprocess = re.split(r'([,.?]|--|\s)',word)
        # Drop empty strings and pure-whitespace pieces produced by the split.
        preprocess = [item for item in preprocess if item is not None and item.strip()]
        return [self.word_to_int[i] for i in preprocess]

    def decode(self,ids):
        """Convert a sequence of token ids back into a text string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises
        ------
        KeyError
            If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_word[i] for i in ids)
        # Re-attach punctuation to the preceding word. The replacement has to
        # be the backreference r'\1'; a literal '/1' would overwrite the
        # punctuation mark with the characters "/1".
        text = re.sub(r'\s+([.,?/])', r'\1', text)
        return text

# Round-trip a sample sentence: text -> ids -> text.
sample = "The Brown dog playfully chased the swift fox"
tokenize = TokenizersV1(vocab)
encoded = tokenize.encode(sample)
decoded = tokenize.decode(encoded)
print(encoded, decoded, sep="\n")


[6, 2, 8, 12, 10, 6, 11, 3]
the brown dog playfully chased the swift fox


In [4]:
# Vocabulary for a tokenizer that can cope with unknown words: the regular
# tokens are followed by the special tokens <|endoftext|> and <|unk|>.
train_data = "The quick brown fox jumps over the lazy dog and chased swift playfully"

# Split on punctuation, double dashes and whitespace, keeping only non-blank pieces.
word = [
    piece
    for piece in re.split(r"([.,?]|--|\s)", train_data)
    if piece is not None and piece.strip()
]
word += ["<|endoftext|>", "<|unk|>"]

# Each token's id is its position in the list. `word` is rebound from the
# token list to the token -> id mapping, and `vocab` is a second name for it.
word = dict(zip(word, range(len(word))))
vocab = word
print(word)


{'The': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumps': 4, 'over': 5, 'the': 6, 'lazy': 7, 'dog': 8, 'and': 9, 'chased': 10, 'swift': 11, 'playfully': 12, '<|endoftext|>': 13, '<|unk|>': 14}


In [5]:

class TokenizersV2:
    """Word-level tokenizer that maps out-of-vocabulary words to ``<|unk|>``.

    Parameters
    ----------
    vocab : dict
        Mapping from token string to integer id. It must contain the key
        ``"<|unk|>"`` for unknown words to be encodable. The reverse mapping
        used for decoding is derived from it, so ids are expected to be unique.
    """

    def __init__(self,vocab):
        self.word_to_int = vocab
        # Reverse lookup (id -> token) used by decode().
        self.int_to_word = {i:s for s,i in vocab.items()}

    def encode(self, text):
        """Convert a text string into a list of token ids.

        The text is lower-cased before lookup. Every token that is not in the
        vocabulary is replaced by the id of ``"<|unk|>"``.

        Raises
        ------
        KeyError
            If an unknown token is met and ``"<|unk|>"`` is not in the vocabulary.
        """
        text = text.lower()
        # The capturing group keeps the separators, so punctuation becomes
        # a token of its own instead of being thrown away.
        words = re.split(r'([,.?_/\[\]]|--|\s)', text)
        # Drop empty strings and pure-whitespace pieces produced by the split.
        words = [item for item in words if item is not None and item.strip()]
        return [
            self.word_to_int[item if item in self.word_to_int else "<|unk|>"]
            for item in words
        ]

    def decode(self,ids):
        """Convert a sequence of token ids back into a text string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises
        ------
        KeyError
            If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_word[i] for i in ids)
        # Re-attach punctuation to the preceding word. The pattern must not
        # contain stray literal spaces (they made it require several
        # consecutive blanks, so it never matched), and the replacement has to
        # be the backreference r'\1' rather than the literal text "/1".
        text = re.sub(r'\s+([,.;:?])', r'\1', text)
        return text
    

# Round-trip a sentence whose last two words are not in the vocabulary;
# the tokenizer encodes them with the <|unk|> id.
sample = "The Brown dog playfully chased the swift fox virat bmw"
tokenize = TokenizersV2(vocab)
encoded = tokenize.encode(sample)
decoded = tokenize.decode(encoded)
print(encoded, decoded, sep="\n")

[6, 2, 8, 12, 10, 6, 11, 3, 14, 14]
the brown dog playfully chased the swift fox <|unk|> <|unk|>
