In [None]:
class WhitespaceTokenizer:
    """Minimal tokenizer that splits text on whitespace.

    The vocabulary is learned with ``fit`` and stored in two lookup tables:
    ``stoi`` (token -> index) and ``itos`` (index -> token). There is no
    fallback for unseen tokens, so ``encode`` raises ``KeyError`` for a token
    that was not present during ``fit`` (and ``decode`` does the same for an
    unknown index).
    """

    def __init__(self):
        # Both tables stay empty until fit() is called.
        self.stoi = {}
        self.itos = {}

    def fit(self, texts):
        """Build the vocabulary from an iterable of strings.

        Indices are assigned in sorted token order, starting at 0.
        """
        seen = set()
        for text in texts:
            for word in text.strip().split():
                seen.add(word)

        ordered = sorted(seen)
        self.stoi = dict(zip(ordered, range(len(ordered))))
        self.itos = dict(enumerate(ordered))

    def tokenize(self, text):
        """Return the whitespace-separated tokens of ``text`` as a list."""
        trimmed = text.strip()
        return trimmed.split()

    def encode(self, text):
        """Map ``text`` to a list of vocabulary indices (KeyError if unseen)."""
        ids = []
        for word in self.tokenize(text):
            ids.append(self.stoi[word])
        return ids

    def decode(self, indices):
        """Map indices back to tokens and join them with single spaces."""
        words = [self.itos[idx] for idx in indices]
        return " ".join(words)

In [None]:
# Small training corpus used to build the tokenizer vocabulary: three short
# sentences plus a longer multi-paragraph passage (the opening of "Alice's
# Adventures in Wonderland").
# The tokenizer splits on whitespace only, so punctuation, curly quotes and
# underscores stay attached to their words (e.g. "bank," and "_very_" become
# their own vocabulary entries).
texts = [
    "hello world",
    "hello machine learning",
    "deep learning world",
    """
Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought Alice
“without pictures or conversations?”

So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a daisy-chain would be worth the trouble of getting up and
picking the daisies, when suddenly a White Rabbit with pink eyes ran
close by her.

There was nothing so _very_ remarkable in that; nor did Alice think it
so _very_ much out of the way to hear the Rabbit say to itself, “Oh
dear! Oh dear! I shall be late!” (when she thought it over afterwards,
it occurred to her that she ought to have wondered at this, but at the
time it all seemed quite natural); but when the Rabbit actually _took a
watch out of its waistcoat-pocket_, and looked at it, and then hurried
on, Alice started to her feet, for it flashed across her mind that she
had never before seen a rabbit with either a waistcoat-pocket, or a
watch to take out of it, and burning with curiosity, she ran across the
field after it, and fortunately was just in time to see it pop down a
large rabbit-hole under the hedge.

In another moment down went Alice after it, never once considering how
in the world she was to get out again.

    """
]

# Learn the vocabulary from the sample corpus.
tokenizer = WhitespaceTokenizer()
tokenizer.fit(texts)

# Round-trip a sentence made only of in-vocabulary words: text -> ids -> text.
encoded = tokenizer.encode("hello world")
decoded = tokenizer.decode(encoded)

print("Encoded:", encoded)
print("Decoded:", decoded)

In [None]:
# Demonstration: this tokenizer has no fallback for words it did not see in
# fit(), so encoding a sentence with an out-of-vocabulary word raises KeyError.
# The exception is caught and displayed so that the demonstration does not
# abort a Restart & Run All of the notebook.
try:
    tokenizer.encode("Alice chased a rabbit down the hole")
except KeyError as err:
    print("KeyError for unknown token:", err)

In [None]:
class WhitespaceTokenizer:
    """Whitespace tokenizer with a reserved entry for unknown tokens.

    The vocabulary is learned with ``fit`` and stored in two lookup tables:
    ``stoi`` (token -> index) and ``itos`` (index -> token). Index
    ``UNK_INDEX`` is always reserved for ``UNK_TOKEN``; tokens that were not
    seen during ``fit`` encode to that index, and indices that are not in the
    vocabulary decode to that token, so neither direction raises ``KeyError``.
    """

    UNK_TOKEN = "<UNK>"  # placeholder string for out-of-vocabulary tokens
    UNK_INDEX = 0        # index reserved for UNK_TOKEN

    def __init__(self):
        self.stoi = {}  # token string -> integer index
        self.itos = {}  # integer index -> token string

    def fit(self, texts):
        """Build the vocabulary from an iterable of strings.

        Args:
            texts: Iterable of strings; each one is split with ``tokenize``.

        Real tokens are numbered in sorted order starting right after
        ``UNK_INDEX``, so the assigned indices are always the contiguous
        range ``0 .. len(self.stoi) - 1``.
        """
        vocab = set()
        for text in texts:
            vocab.update(self.tokenize(text))

        # A literal "<UNK>" in the corpus must not be numbered a second time:
        # that would overwrite the reserved entry, leave the reserved index
        # unassigned and shift every other token index up by one.
        vocab.discard(self.UNK_TOKEN)

        self.stoi = {self.UNK_TOKEN: self.UNK_INDEX}
        for i, tok in enumerate(sorted(vocab), start=self.UNK_INDEX + 1):
            self.stoi[tok] = i

        self.itos = {i: tok for tok, i in self.stoi.items()}

    def tokenize(self, text):
        """Return the whitespace-separated tokens of ``text`` as a list."""
        # split() without arguments already ignores leading/trailing
        # whitespace, so no separate strip() is required.
        return text.split()

    def encode(self, text):
        """Map ``text`` to a list of indices; unseen tokens map to UNK_INDEX."""
        unk = self.UNK_INDEX
        return [self.stoi.get(tok, unk) for tok in self.tokenize(text)]

    def decode(self, indices):
        """Map indices to tokens joined by single spaces.

        Indices that are not in the vocabulary are rendered as ``UNK_TOKEN``.
        """
        return " ".join(self.itos.get(i, self.UNK_TOKEN) for i in indices)

In [None]:
# Rebuild the vocabulary with the <UNK>-aware tokenizer class defined above.
tokenizer = WhitespaceTokenizer()
tokenizer.fit(texts)

# Words that are missing from the vocabulary are encoded as index 0 instead of
# raising an error.
wonderland = tokenizer.encode(
    "Alice chased a rabbit down the hole"
)
print(wonderland)

In [None]:
import math

def normalize_l2(vec):
    """Scale a numeric vector to unit Euclidean (L2) length.

    Args:
        vec: Iterable of numbers. One-shot iterators and generators are
            accepted as well as lists and tuples.

    Returns:
        A new list with one entry per input value, each divided by the L2
        norm of the input. If the norm is 0 (all-zero or empty input), a list
        of ``0.0`` values of the same length is returned instead, avoiding a
        division by zero.
    """
    # Materialize once: the values are traversed twice (norm, then scaling),
    # which would silently produce [] for a one-shot iterator.
    values = list(vec)
    norm = math.sqrt(sum(x * x for x in values))
    if norm == 0:
        # Float zeros keep the element type consistent with the scaled branch.
        return [0.0 for _ in values]
    return [x / norm for x in values]

# Demonstrate normalize_l2 on the encoded sentence: the list of token ids is
# treated as a plain numeric vector and divided by its Euclidean norm.
print(normalize_l2(wonderland))