In [62]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from typing import List
import itertools
class WordEmbeddingModel(nn.Module):
    """N-gram neural language model used to learn word embeddings.

    The ids of the ``context_size`` preceding tokens are embedded, the
    embeddings are concatenated, passed through one hidden ReLU layer with
    128 units and projected to log-probabilities over the vocabulary.

    Args:
        vocab_size: Number of distinct token ids (valid ids are
            ``0 .. vocab_size - 1``).
        embedding_dim: Size of each embedding vector.
        context_size: Number of preceding tokens used to predict the next one.
    """

    def __init__(self, vocab_size, embedding_dim = 10, context_size = 2):
        super(WordEmbeddingModel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        # Misspelled alias kept so code reading the old attribute keeps working.
        self.vocal_size = vocab_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the next token.

        Args:
            inputs: Long tensor of token ids, either a single context of shape
                ``(context_size,)`` or a batch of shape ``(B, context_size)``.

        Returns:
            Tensor of shape ``(B, vocab_size)`` (``B == 1`` for a single
            context) holding log-softmax scores.
        """
        embeds = self.embeddings(inputs)
        # One row per context: a 1-D input yields the same (1, C * D) layout
        # as before, while a 2-D batch now yields (B, C * D).
        embeds = embeds.reshape(-1, self.context_size * self.embedding_dim)
        hidden = F.relu(self.linear1(embeds))
        return F.log_softmax(self.linear2(hidden), dim=1)

    def train(self, sentences = True, lr = 0.001, epochs = 10):
        """Fit the model on ``sentences``, or switch training mode.

        This method shadows ``nn.Module.train``, which ``nn.Module.eval()``
        invokes as ``self.train(False)``. To keep ``eval()`` / ``train()``
        working, a boolean first argument is forwarded to the base class;
        anything else is treated as training data and passed to :meth:`fit`.

        Args:
            sentences: Iterable of token-id sequences to fit on, or a bool to
                set the module's training mode.
            lr: SGD learning rate (ignored when switching mode).
            epochs: Number of passes over the data (ignored when switching
                mode).

        Returns:
            The list of per-epoch summed losses when fitting, or ``self``
            when switching mode.
        """
        if isinstance(sentences, bool):
            return super(WordEmbeddingModel, self).train(sentences)
        return self.fit(sentences, lr=lr, epochs=epochs)

    def fit(self, sentences, lr = 0.001, epochs = 10):
        """Train with per-sample SGD on n-grams extracted from ``sentences``.

        All sentences are concatenated into one token stream before n-grams
        are built, so a context may span a sentence boundary. Each context
        lists the preceding tokens most-recent first.

        Args:
            sentences: Iterable of sequences of integer token ids.
            lr: Learning rate passed to ``optim.SGD``.
            epochs: Number of passes over the n-grams.

        Returns:
            List with the summed NLL loss of every epoch.
        """
        loss_function = nn.NLLLoss()
        optimizer = optim.SGD(self.parameters(), lr=lr)
        tokens = list(itertools.chain.from_iterable(sentences))
        # The tensors do not change between epochs, so build them once.
        ngrams = [
            (
                torch.tensor(
                    [tokens[i - j - 1] for j in range(self.context_size)],
                    dtype=torch.long,
                ),
                torch.tensor([tokens[i]], dtype=torch.long),
            )
            for i in range(self.context_size, len(tokens))
        ]
        losses = []
        for epoch in range(epochs):
            total_loss = 0.0
            print("epoch = %d" % epoch)
            for context_idxs, target_idx in ngrams:
                # Gradients accumulate across backward() calls, so clear them
                # before processing each new sample.
                optimizer.zero_grad()

                # Forward pass: log-probabilities over the next token.
                log_probs = self(context_idxs)
                loss = loss_function(log_probs, target_idx)

                # Backward pass and parameter update.
                loss.backward()
                optimizer.step()

                # .item() extracts the Python float from the 1-element tensor.
                total_loss += loss.item()
            losses.append(total_loss)
            print("loss = %lf" % total_loss)
        return losses
  

        

In [63]:
# Toy corpus: every inner list is one sentence of integer token ids.
sentences = [
    [1, 2, 3, 4],
    [2, 4, 5, 6, 7],
    [4, 3, 4, 4],
]

In [64]:
# vocab_size=10 gives the embedding table rows for token ids 0-9; the two
# preceding tokens form the context for each prediction.
embeding_model = WordEmbeddingModel(vocab_size=10, embedding_dim=10, context_size=2)

In [65]:
# Fit on the toy corpus with the default learning rate; the loss summed over
# each epoch is printed as training progresses.
embeding_model.train(sentences=sentences)

epoch = 0
loss = 26.199548
epoch = 1
loss = 25.691113
epoch = 2
loss = 25.197547
epoch = 3
loss = 24.718529
epoch = 4
loss = 24.253862
epoch = 5
loss = 23.803203
epoch = 6
loss = 23.366371
epoch = 7
loss = 22.943171
epoch = 8
loss = 22.533129
epoch = 9
loss = 22.136443


In [66]:
# Display the embedding row for token id 9. None of the sentences defined
# above contains id 9, so training on them never looks this row up.
embeding_model.embeddings.weight[9]

tensor([ 2.1557,  0.3412,  0.5148,  0.7387,  0.9112,  0.3007, -0.9163,  0.6143,
        -0.6890,  0.2110], grad_fn=<SelectBackward0>)