In [1]:
import math
import random
from typing import Dict, Iterable, List, Optional

from linear_algebra import dot, Vector

In [2]:
def cosine_similarity(v1: Vector, v2: Vector) -> float:
    """Return dot(v1, v2) scaled by the geometric mean of the self-products.

    Assuming `dot` is the ordinary dot product, this is the cosine of the
    angle between the two vectors. Raises ZeroDivisionError when either
    vector has a zero self-product.
    """
    numerator = dot(v1, v2)
    squared_norm_product = dot(v1, v1) * dot(v2, v2)
    return numerator / math.sqrt(squared_norm_product)

In [3]:
# Word pools for the toy sentence generator. The empty string in `colors`
# and `adverbs` lets that slot be left out of a generated sentence.
colors = [
    "red",
    "green",
    "blue",
    "yellow",
    "black",
    "",
]
nouns = [
    "bed",
    "car",
    "boat",
    "cat",
]
verbs = [
    "is",
    "was",
    "seems",
]
adverbs = [
    "very",
    "quite",
    "extremely",
    "",
]
adjectives = [
    "slow",
    "fast",
    "soft",
    "hard",
]

In [4]:
def make_sentence() -> str:
    """Build one random sentence: "The [color] noun verb [adverb] adjective .".

    One word is drawn from each module-level pool, always in the order
    colors, nouns, verbs, adverbs, adjectives, so a seeded `random` yields
    the same sequence of picks as before. `colors` and `adverbs` contain an
    empty string to make that slot optional; empty picks are skipped when
    joining so the sentence never contains doubled spaces.
    """
    parts = [
        "The",
        random.choice(colors),
        random.choice(nouns),
        random.choice(verbs),
        random.choice(adverbs),
        random.choice(adjectives),
        ".",
    ]
    # Drop the "" placeholders instead of joining them into "The  bed ...".
    return " ".join(part for part in parts if part)
# Size of the synthetic corpus.
NUM_SENTENCES = 50

# Seed the generator right before sampling so the corpus is reproducible.
random.seed(0)
sentences = [
    make_sentence()
    for _sentence_number in range(NUM_SENTENCES)
]

In [5]:
from typing import List
# Type alias used in annotations below: a Tensor is represented as a plain list.
Tensor = list

In [6]:
class Vocabulary:
    """Two-way mapping between words and consecutive integer ids.

    Ids are handed out in insertion order starting at 0. Adding a word that
    is already present leaves the vocabulary unchanged.
    """

    def __init__(self, words: Optional[Iterable[str]] = None) -> None:
        """Create a vocabulary, optionally pre-filled from `words`.

        `words` may be any iterable of strings (a list or a generator);
        duplicates are ignored after their first occurrence.
        """
        self.w2i: Dict[str, int] = {}  # mapping word -> word_id
        self.i2w: Dict[int, str] = {}  # mapping word_id -> word
        for word in (words or []):  # If words were provided,
            self.add(word)  # add them.

    @property
    def size(self) -> int:
        """how many words are in the vocabulary"""
        return len(self.w2i)

    def add(self, word: str) -> None:
        """Register `word` under the next free id; no-op if already known."""
        if word not in self.w2i:  # If the word is new to us:
            word_id = len(self.w2i)  # Find the next id.
            self.w2i[word] = word_id  # Add to the word -> word_id map.
            self.i2w[word_id] = word  # Add to the word_id -> word map.

    def get_id(self, word: str) -> Optional[int]:
        """return the id of the word (or None)"""
        return self.w2i.get(word)

    def get_word(self, word_id: int) -> Optional[str]:
        """return the word with the given id (or None)"""
        return self.i2w.get(word_id)

    def one_hot_encode(self, word: str) -> Tensor:
        """Return a list of `size` floats: 1.0 at the word's id, 0.0 elsewhere.

        Fails an assertion if `word` is not in the vocabulary.
        """
        word_id = self.get_id(word)
        assert word_id is not None, f"unknown word {word}"
        return [1.0 if i == word_id else 0.0 for i in range(self.size)]

In [7]:
# Sanity checks for Vocabulary: ids follow insertion order, starting at 0.
vocab = Vocabulary(["a", "b", "c"])
assert vocab.size == 3, "there are 3 words in the vocab"
assert vocab.get_id("b") == 1, "b should have word_id 1"
assert vocab.one_hot_encode("b") == [0, 1, 0]
assert vocab.get_id("z") is None, "z is not in the vocab"
assert vocab.get_word(2) == "c", "word_id 2 should be c"
# Adding a new word grows the vocabulary and lengthens the one-hot vectors.
vocab.add("z")
assert vocab.size == 4, "now there are 4 words in the vocab"
assert vocab.get_id("z") == 3, "now z should have id 3"
assert vocab.one_hot_encode("z") == [0, 0, 0, 1]

In [8]:
import json
def save_vocab(vocab: Vocabulary, filename: str) -> None:
    """Write the vocabulary's word -> id mapping to `filename` as JSON."""
    word_to_id = vocab.w2i  # i2w is not written; only this mapping is saved
    with open(filename, 'w') as out_file:
        json.dump(word_to_id, out_file)
def load_vocab(filename: str) -> Vocabulary:
    """Read a JSON word -> id mapping from `filename` into a new Vocabulary."""
    with open(filename) as in_file:
        word_to_id = json.load(in_file)

    restored = Vocabulary()
    restored.w2i = word_to_id
    # Only w2i is stored on disk; the reverse mapping is rebuilt by inversion.
    restored.i2w = {word_id: word for word, word_id in word_to_id.items()}
    return restored

In [9]:
from typing import Iterable, Tuple
from scratch.deep_learning import Layer, Tensor, random_tensor, zeros_like
class Embedding(Layer):
    """Layer that maps an integer id to one row of a table of vectors."""

    def __init__(self, num_embeddings: int, embedding_dim: int) -> None:
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim

        # Table with one vector of size embedding_dim per id, created by
        # random_tensor, plus a gradient table of matching shape.
        self.embeddings = random_tensor(num_embeddings, embedding_dim)
        self.grad = zeros_like(self.embeddings)

        # Id whose gradient row was written by the previous backward() call.
        self.last_input_id = None

    def forward(self, input_id: int) -> Tensor:
        """Return the embedding row for `input_id`, recording the id."""
        self.input_id = input_id  # needed again in backward()
        return self.embeddings[input_id]

    def backward(self, gradient: Tensor) -> None:
        """Store `gradient` as the gradient row of the most recent input id."""
        # Only the row touched by the previous call can be non-zero, so
        # resetting just that row avoids rebuilding the whole gradient table.
        previous_id = self.last_input_id
        if previous_id is not None:
            self.grad[previous_id] = [0] * self.embedding_dim

        self.last_input_id = self.input_id
        self.grad[self.input_id] = gradient

    def params(self) -> Iterable[Tensor]:
        """Return the embedding table as the only parameter."""
        return [self.embeddings]

    def grads(self) -> Iterable[Tensor]:
        """Return the gradient table matching params()."""
        return [self.grad]

<Figure size 640x480 with 0 Axes>

In [10]:
class TextEmbedding(Embedding):
    """Embedding layer tied to a Vocabulary so rows can be looked up by word."""

    def __init__(self, vocab: Vocabulary, embedding_dim: int) -> None:
        # One embedding row per vocabulary word.
        super().__init__(vocab.size, embedding_dim)
        # Keep the vocabulary for word <-> id translation.
        self.vocab = vocab

    def __getitem__(self, word: str) -> Tensor:
        """Return the embedding row for `word`, or None if it is unknown."""
        word_id = self.vocab.get_id(word)
        if word_id is None:
            return None
        return self.embeddings[word_id]

    def closest(self, word: str, n: int = 5) -> List[Tuple[float, str]]:
        """Returns the n closest words based on cosine similarity"""
        target = self[word]
        # Score every vocabulary word (including `word` itself) against target.
        scored = []
        for candidate, candidate_id in self.vocab.w2i.items():
            similarity = cosine_similarity(target, self.embeddings[candidate_id])
            scored.append((similarity, candidate))
        # Highest similarity first, then keep the top n.
        return sorted(scored, reverse=True)[:n]

In [11]:
import re
# Lower-case each sentence, then keep runs of a-z and literal periods as
# tokens. Not a great regex, but it works on our data.
tokenized_sentences = [
    re.findall("[a-z]+|[.]", text.lower())
    for text in sentences
]

In [12]:
# Build the word -> word_id mapping from every token in the corpus,
# in order of first appearance.
vocab = Vocabulary(
    token
    for tokens in tokenized_sentences
    for token in tokens
)

In [13]:
from scratch.deep_learning import Tensor, one_hot_encode
# Training pairs: for each word, one (word_id, one-hot nearby word) pair per
# neighbour that lies within two positions in the same sentence.
inputs: List[int] = []
targets: List[Tensor] = []
for sentence in tokenized_sentences:
    sentence_length = len(sentence)
    for i, word in enumerate(sentence):
        for j in (i - 2, i - 1, i + 1, i + 2):
            # Skip positions that fall off either end of the sentence.
            if j < 0 or j >= sentence_length:
                continue
            nearby_word = sentence[j]
            # Input is the center word's id ...
            inputs.append(vocab.get_id(word))
            # ... and the target is the one-hot-encoded neighbour.
            targets.append(vocab.one_hot_encode(nearby_word))

In [14]:
from scratch.deep_learning import Sequential, Linear
random.seed(0)
EMBEDDING_DIM = 5  # seems like a good size

# Define the embedding layer separately, so we can reference it.
embedding = TextEmbedding(vocab=vocab, embedding_dim=EMBEDDING_DIM)

model = Sequential([
    # Given a word (as a single word_id), look up its embedding.
    embedding,
    # And use a linear layer to compute scores for "nearby words."
    Linear(input_dim=EMBEDDING_DIM, output_dim=vocab.size),
])

In [15]:
from scratch.deep_learning import SoftmaxCrossEntropy, Momentum, GradientDescent
loss = SoftmaxCrossEntropy()
optimizer = GradientDescent(learning_rate=0.01)

# Train for 100 passes over the (word_id, one-hot neighbour) pairs, taking one
# optimizer step per pair.
for epoch in range(100):
    epoch_loss = 0.0
    # The loop variable is `input_id`, not `input`: at notebook top level a
    # loop variable stays bound afterwards and would replace the builtin
    # input() for every later cell.
    for input_id, target in zip(inputs, targets):
        predicted = model.forward(input_id)
        epoch_loss += loss.loss(predicted, target)
        gradient = loss.gradient(predicted, target)
        model.backward(gradient)
        optimizer.step(model)

# These run once, after the loop, so they report the final epoch only.
print(epoch, epoch_loss) # Print the loss
print(embedding.closest("black")) # and also a few nearest words
print(embedding.closest("slow")) # so we can see what's being
print(embedding.closest("car")) # learned.

99 2403.2259775721227
[(1.0, 'black'), (0.9821293900663808, 'blue'), (0.8709372536245938, 'yellow'), (0.8560680073921021, 'green'), (0.7209977273875182, 'red')]
[(1.0, 'slow'), (0.8900872652719086, 'fast'), (0.8846955683864672, 'hard'), (0.8505709819185355, 'soft'), (0.3979688002551069, 'quite')]
[(1.0, 'car'), (0.8545578601895432, 'cat'), (0.7977433731597623, 'bed'), (0.6982593591613964, 'boat'), (0.22838470790337795, 'black')]


In [16]:
# Number of training pairs (one entry is appended to `inputs` per pair).
len(inputs)

1004