In [None]:
import numpy as np
from nltk.tokenize import WordPunctTokenizer


class DumbTextEncoder:
    """Toy word-level text encoder with 'index' and 'one_hot' strategies.

    The vocabulary is the set of unique lower-cased tokens that
    ``WordPunctTokenizer`` produces from ``source_text``. Tokens are numbered
    in sorted order, so the same source text always yields the same ids
    (iterating the raw ``set`` would make the ids depend on string hashing,
    which varies between interpreter runs).
    """

    # Fallbacks for tokens / ids that are not part of the vocabulary.
    UNK_INDEX = -1
    UNK_TOKEN = 'UNK'

    def __init__(self, source_text: str):
        """Build the vocabularies from ``source_text``.

        Args:
            source_text: Text whose unique lower-cased tokens form the
                vocabulary. An empty string gives an empty vocabulary.
        """
        self.tokenizer = WordPunctTokenizer()
        # Unique lower-cased tokens; kept as a set for membership checks.
        self.tokens = set(self._tokenize(source_text))

        # Fix one enumeration order and reuse it for every vocabulary so the
        # index and one-hot encodings agree with each other and across runs.
        ordered_tokens = sorted(self.tokens)

        # Index method
        self.index_vocab = {  # For Encoder: token -> id
            token: i for i, token in enumerate(ordered_tokens)
        }
        self.inv_index_vocab = {  # For Decoder: id -> token
            i: token for token, i in self.index_vocab.items()
        }

        # One hot method: build the identity matrix once and hand out its
        # rows instead of allocating a new N x N matrix per token.
        identity = np.eye(len(ordered_tokens))
        self.onehot_vocab = {
            token: identity[i] for i, token in enumerate(ordered_tokens)
        }

    def _tokenize(self, text: str) -> list:
        """Return the lower-cased tokens of ``text``."""
        return [token.lower() for token in self.tokenizer.tokenize(text)]

    def encode(self, text: str, strategy: str) -> list:
        """Encode ``text`` token by token.

        Args:
            text: Text to encode; it is tokenized and lower-cased the same
                way as the source text.
            strategy: ``'index'`` to get integer ids (unknown tokens become
                ``-1``) or ``'one_hot'`` to get one float vector per token
                (unknown tokens become all-zero vectors).

        Returns:
            A list with one entry per token of ``text``.

        Raises:
            ValueError: If ``strategy`` is not ``'index'`` or ``'one_hot'``.
        """
        # Validate up front so a typo in the strategy name is reported
        # clearly instead of failing on a ``None`` vocabulary.
        if strategy not in ('index', 'one_hot'):
            raise ValueError(
                f"Unknown strategy {strategy!r}; "
                "expected 'index' or 'one_hot'"
            )

        tokens = self._tokenize(text)
        if strategy == 'index':
            return [
                self.index_vocab.get(token, self.UNK_INDEX)
                for token in tokens
            ]

        unk_vector = np.zeros(len(self.tokens))
        # Copy each vector so callers can modify the result without
        # corrupting the stored vocabulary or sibling entries.
        return [
            self.onehot_vocab.get(token, unk_vector).copy()
            for token in tokens
        ]

    def decode(self, numbers) -> str:
        """Turn integer ids back into a space-separated string.

        Args:
            numbers: Iterable of ids as produced by the ``'index'`` strategy.

        Returns:
            The tokens joined by single spaces; any id that is not in the
            vocabulary (including ``-1``) is rendered as ``'UNK'``.
        """
        return ' '.join(
            self.inv_index_vocab.get(num, self.UNK_TOKEN)
            for num in numbers
        )


In [None]:
# Build the vocabulary from a short sample text.
dumb_encoder = DumbTextEncoder('Hello how are you? What is you name?')

# Display the unique lower-cased tokens collected from the sample.
dumb_encoder.tokens

{'?', 'are', 'hello', 'how', 'is', 'name', 'what', 'you'}

In [None]:
# Encode as vocabulary ids; tokens missing from the vocabulary become -1.
dumb_encoder.encode('Hi, How are you? My name is Daniel Johnston.', 'index')

[-1, -1, 0, 4, 5, 7, -1, 3, 2, -1, -1, -1]

In [None]:
# Encode as one-hot vectors; tokens missing from the vocabulary become all-zero vectors.
dumb_encoder.encode('Hi, How are you? My name is Daniel Johnston.', 'one_hot')

[array([0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 1., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 1.]),
 array([0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 0., 0., 0., 0.]),
 array([0., 0., 1., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0.])]

In [None]:
# Round-trip through encode() rather than pasting literal ids: the id given to
# each token depends on how the vocabulary was enumerated, so hard-coded ids
# from an earlier run can silently decode to the wrong words.
dumb_encoder.decode(
    dumb_encoder.encode('Hi, How are you? My name is Daniel Johnston.', 'index')
)

'UNK UNK how are you ? UNK name is UNK UNK UNK'