In [1]:
from pathlib import Path

In [2]:
# Location of the raw text corpus, relative to the notebook's working directory.
dataloc = Path("Data") / "input.txt"

In [3]:
# Read the whole corpus into memory as one UTF-8 decoded string.
with dataloc.open(mode='r', encoding='utf-8') as datafile:
    data = datafile.read()

In [5]:
# Sanity check: show the first 100 characters of the loaded text.
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# Character-level vocabulary: every distinct character in the corpus, sorted.
vocab = list(set(data))
vocab.sort()
print(f"Vocabulary Size: {len(vocab)}", f"Vocabulary: {vocab}", sep="\n")

Vocabulary Size: 65
Vocabulary: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
class tokenizer:
    """Minimal character- or word-level tokenizer.

    The vocabulary is learned from a training text by ``train`` and stored as
    two lookup tables: ``stoi`` (token -> integer id) and ``itos`` (integer id
    -> token). Ids are assigned in sorted token order.

    Supported modes:
        'char': every character of the text is a token.
        'word': tokens are the whitespace-separated pieces of the text
                (``str.split()``). Decoding joins tokens with a single space,
                so the original whitespace is not restored.
    """

    def __init__(self, mode = 'char', max_vocab_size=None):
        """Create an untrained tokenizer.

        Args:
            mode: 'char' or 'word'. The value is not validated here; an
                unsupported mode raises ValueError when ``train``, ``encode``
                or ``decode`` is called.
            max_vocab_size: stored on the instance, but not used by any
                method of this class.
        """
        self.mode = mode
        self.max_vocab_size = max_vocab_size

        # Empty until train() runs; train() replaces `vocab` with a sorted list.
        self.vocab = set()
        self.stoi = {}
        self.itos = {}

    def _split(self, text):
        """Return the tokens of ``text`` for the current mode.

        Raises:
            ValueError: if ``self.mode`` is neither 'char' nor 'word'.
        """
        if self.mode == 'char':
            # Iterating a string yields its characters, so no copy is needed.
            return text
        if self.mode == 'word':
            return text.split()
        raise ValueError(f"{self.mode} not supported yet! We only support 'char' and 'word'!")

    def train(self, dataset):
        """Build the vocabulary and both lookup tables from ``dataset``.

        Args:
            dataset: the training text.

        Raises:
            ValueError: if ``self.mode`` is not supported. This used to be a
                printed message only, which left the tokenizer empty and made
                the later encode/decode failure point at the wrong cause.
        """
        # Tokenize first so an unsupported mode fails before any state changes.
        tokens = self._split(dataset)
        self.vocab = sorted(set(tokens))
        self.stoi = {tok: idx for idx, tok in enumerate(self.vocab)}
        self.itos = {idx: tok for idx, tok in enumerate(self.vocab)}

    def encode(self, text):
        """Convert ``text`` into a list of integer token ids.

        Raises:
            ValueError: if the vocabulary is empty (``train`` has not produced
                any tokens yet) or ``self.mode`` is not supported.
            KeyError: if ``text`` contains a token that is not in the vocabulary.
        """
        if len(self.stoi) == 0:
            raise ValueError("First run train method on your data!")

        return [self.stoi[tok] for tok in self._split(text)]

    def decode(self, ints):
        """Convert an iterable of integer token ids back into a string.

        Tokens are concatenated directly in 'char' mode and joined with a
        single space in 'word' mode.

        Raises:
            ValueError: if the vocabulary is empty (``train`` has not produced
                any tokens yet) or ``self.mode`` is not supported.
            KeyError: if an id in ``ints`` is not in the vocabulary.
        """
        if len(self.itos) == 0:
            raise ValueError("First run train method on your data!")

        tokens = [self.itos[i] for i in ints]
        if self.mode == 'char':
            return "".join(tokens)
        elif self.mode == 'word':
            return " ".join(tokens)
        else:
            raise ValueError(f"{self.mode} not supported yet! We only support 'char' and 'word'!")

In [8]:
# Fit a character-level tokenizer on the corpus, then encode a sample string.
t = tokenizer(mode='char')
t.train(data)
encoded = t.encode("Hello World!!")
# Bare last expression so the notebook displays the resulting id list.
encoded

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42, 2, 2]

In [9]:
# Round-trip check: decoding the ids should give back the encoded string.
decoded = t.decode(encoded)
# Bare last expression so the notebook displays the decoded text.
decoded

'Hello World!!'