<a href="https://colab.research.google.com/github/Woracle/Doc2Vec_pytorch/blob/main/Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#sphx-glr-beginner-nlp-word-embeddings-tutorial-py

Developed as a tweak to the example at the link above. Rather than building word2vec, I created a doc2vec method based on training an embedding for each document label. The next step would be to wrap the module, methods, etc.
within a formal class and store information such as which entries in the vocab are documents and which are words, for easier information retrieval.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [None]:
# A small LSTM with input size 3 and hidden size 3.
lstm = nn.LSTM(3,3)

# Five random input vectors, each of shape (1, 3), used as a toy sequence.
inputs = [torch.randn(1, 3) for _ in range(5)]

In [None]:
# Random initial (hidden state, cell state) pair for the LSTM, each of shape (1, 1, 3).
hidden = (torch.randn(1,1,3), torch.randn(1,1,3))

In [None]:
hidden[0].view(1, 1, -1)

tensor([[[-1.0708, -0.4503, -0.1727]]])

In [None]:
# Feed the sequence through the LSTM one element at a time, carrying the
# (hidden, cell) state forward from one step to the next.
for step_input in inputs:
    reshaped = step_input.view(1, 1, -1)  # add the leading dimensions the LSTM expects
    print(reshaped)
    out, hidden = lstm(reshaped, hidden)

tensor([[[ 0.9961,  1.0446, -0.7019]]])
tensor([[[-0.4111, -0.2937,  0.4871]]])
tensor([[[-0.6553,  1.2947, -2.0795]]])
tensor([[[ 1.1747, -1.1901, -0.8126]]])
tensor([[[-1.1561, -0.5007,  0.0127]]])


In [None]:
inputs

[tensor([[ 0.9961,  1.0446, -0.7019]]),
 tensor([[-0.4111, -0.2937,  0.4871]]),
 tensor([[-0.6553,  1.2947, -2.0795]]),
 tensor([[ 1.1747, -1.1901, -0.8126]]),
 tensor([[-1.1561, -0.5007,  0.0127]])]

In [None]:
out

tensor([[[0.4259, 0.0674, 0.2460]]], grad_fn=<StackBackward>)

In [None]:
hidden

(tensor([[[0.4259, 0.0674, 0.2460]]], grad_fn=<StackBackward>),
 tensor([[[0.5782, 0.1903, 0.9118]]], grad_fn=<StackBackward>))

In [None]:
# Let's explore word embeddings.
# Toy vocabulary: each word is numbered by its position in the sentence.
word_to_ix = {
    word: position
    for position, word in enumerate(["This", "is", "an", "experiment"])
}

# One 5-dimensional vector per vocabulary entry.
embeds = nn.Embedding(num_embeddings=len(word_to_ix), embedding_dim=5)

In [None]:
# Index of "experiment" wrapped in a 1-element long tensor, ready to be passed to the embedding layer.
exp = torch.tensor([word_to_ix["experiment"]],dtype=torch.long)

In [None]:
embeds(exp)

tensor([[ 0.6283,  0.5514,  0.2731,  0.6019, -1.0703]],
       grad_fn=<EmbeddingBackward>)

In [None]:
# Can I build a doc2vec here?

# Context window half-width: two words on each side of the target word.
Context_size = 2
# Toy corpus: document label -> list of whitespace-separated tokens.
text = {"doc1" : "This is our first document  for exploring".split(),
        "doc2" : "The red cat sat in a hat and ate cheese".split(),
        "doc3" : "Another couple of sentences later and we have doc2vec".split()}



In [None]:
# Flat vocabulary list: the document labels come first, followed by every
# token of every document in corpus order.
# NOTE: tokens are not de-duplicated, so a word that occurs more than once
# in the corpus (e.g. "and") is listed more than once.
vocab = list(text) + [token for label in text for token in text[label]]


In [None]:
vocab

['doc1',
 'doc2',
 'doc3',
 'This',
 'is',
 'our',
 'first',
 'document',
 'for',
 'exploring',
 'The',
 'red',
 'cat',
 'sat',
 'in',
 'a',
 'hat',
 'and',
 'ate',
 'cheese',
 'Another',
 'couple',
 'of',
 'sentences',
 'later',
 'and',
 'we',
 'have',
 'doc2vec']

In [None]:
# Map every vocabulary entry (document labels and words alike) to an integer
# id. If an entry is listed more than once in `vocab`, its last position wins.
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Build the training pairs. For every position that has a full window on both
# sides, the context is the document label followed by the `Context_size`
# words before and the `Context_size` words after the target word.
data = []
for label, words in text.items():
    for pos in range(Context_size, len(words) - Context_size):
        before = words[pos - Context_size:pos]
        after = words[pos + 1:pos + Context_size + 1]
        data.append(([label] + before + after, words[pos]))
print(data[:5])

[(['doc1', 'This', 'is', 'first', 'document'], 'our'), (['doc1', 'is', 'our', 'document', 'for'], 'first'), (['doc1', 'our', 'first', 'for', 'exploring'], 'document'), (['doc2', 'The', 'red', 'sat', 'in'], 'cat'), (['doc2', 'red', 'cat', 'in', 'a'], 'sat')]


In [None]:
def make_context_vector(context, word_to_ix):
    """Return the vocabulary indices of the `context` tokens as a 1-D long tensor.

    Raises KeyError if a token is missing from `word_to_ix`.
    """
    return torch.tensor([word_to_ix[token] for token in context], dtype=torch.long)

def make_target_vector(target, word_to_ix):
    """Return the vocabulary index of `target` as a 0-dimensional long tensor.

    Raises KeyError if `target` is missing from `word_to_ix`.
    """
    return torch.tensor(word_to_ix[target], dtype=torch.long)

In [None]:
word_to_ix

{'Another': 20,
 'The': 10,
 'This': 3,
 'a': 15,
 'and': 25,
 'ate': 18,
 'cat': 12,
 'cheese': 19,
 'couple': 21,
 'doc1': 0,
 'doc2': 1,
 'doc2vec': 28,
 'doc3': 2,
 'document': 7,
 'exploring': 9,
 'first': 6,
 'for': 8,
 'hat': 16,
 'have': 27,
 'in': 14,
 'is': 4,
 'later': 24,
 'of': 22,
 'our': 5,
 'red': 11,
 'sat': 13,
 'sentences': 23,
 'we': 26}

In [None]:
# Pack the training pairs into NumPy arrays, one row per sample:
# `rec` holds the context indices and `targ` the target index.
# Both arrays are sized from `data` instead of hard-coded dimensions, and
# `targ` is created here because the later cells read it.
# np.empty defaults to float64; the indices are cast back to long when used.
num_samples = len(data)
context_len = len(data[0][0]) if data else 0
rec = np.empty(shape=(num_samples, context_len))
targ = np.empty(shape=(num_samples, 1))

for i, (context, target) in enumerate(data):
    rec[i] = make_context_vector(context, word_to_ix=word_to_ix).numpy()
    targ[i] = make_target_vector(target, word_to_ix=word_to_ix).item()
  

In [None]:
word_to_ix[data[1][1]]

6

In [None]:
# Convert the NumPy arrays to tensors. The float64 dtype is kept, so rows
# are cast to long before being fed to the model or the loss.
# NOTE(review): `targ` must already exist as a NumPy array of target indices
# when this cell runs — confirm it is created alongside `rec`.
rec = torch.from_numpy(rec)
targ = torch.from_numpy(targ)

In [None]:
rec

tensor([[ 0.,  3.,  4.,  6.,  7.],
        [ 0.,  4.,  5.,  7.,  8.],
        [ 0.,  5.,  6.,  8.,  9.],
        [ 1., 10., 11., 13., 14.],
        [ 1., 11., 12., 14., 15.],
        [ 1., 12., 13., 15., 16.],
        [ 1., 13., 14., 16., 25.],
        [ 1., 14., 15., 25., 18.],
        [ 1., 15., 16., 18., 19.],
        [ 2., 20., 21., 23., 24.],
        [ 2., 21., 22., 24., 25.],
        [ 2., 22., 23., 25., 26.],
        [ 2., 23., 24., 26., 27.],
        [ 2., 24., 25., 27., 28.]], dtype=torch.float64)

In [None]:
class Doc2Vec(nn.Module):
    """Predict a target word from the embeddings of its context tokens.

    Document labels and words share a single embedding table. The embeddings
    of all context tokens are concatenated and passed through one linear
    layer that scores every vocabulary entry.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output classes.
        embedding_dim: length of each embedding vector.
        context_size: number of tokens in one context.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        # One embedding vector per vocabulary entry.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Input is the concatenation of `context_size` embedding vectors.
        self.linear = nn.Linear(context_size * embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: long tensor of token indices, either a single context of
                shape (context_size,) or a batch of shape
                (batch, context_size).

        Returns:
            Tensor of shape (batch, vocab_size); batch is 1 for a single
            context.
        """
        embeds = self.embeddings(inputs)
        # Concatenate the context embeddings of each sample into one row.
        # Using the layer's input width (not a fixed leading 1) lets the same
        # code handle a single context and a batch of contexts.
        flat = embeds.view(-1, self.linear.in_features)
        out = self.linear(flat)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs



In [None]:
# Stand-alone layers for inspecting intermediate outputs by hand.
embeddings = nn.Embedding(len(vocab), 10)
# 50 input features = 5 context tokens x 10 embedding dimensions.
linear = nn.Linear(50, len(vocab))


In [None]:
# Embed the context indices in row 13 of `rec`; the row is stored as float64, so cast to long first.
embs = embeddings(rec[13, :].type(torch.long))

In [None]:
embs

tensor([[ 0.1124, -0.4338,  0.1170, -0.1787,  1.0799,  0.2531, -1.2409, -0.6822,
         -0.3185,  1.1176],
        [-0.5583, -0.2488,  0.8714, -0.1752,  0.1384, -1.1334, -0.9564,  0.3587,
          1.6018,  1.1322],
        [-0.3461,  1.6263, -0.0739,  0.2127,  0.6517,  0.1587,  0.6815, -0.0901,
          0.2385,  0.5890],
        [ 0.0043, -0.8064,  0.0291,  0.1963,  0.7239,  0.0136,  0.0103,  0.3219,
         -1.2149,  0.7407],
        [ 0.5951,  1.8907, -0.2101, -1.6978, -0.0039, -1.6067, -1.6329,  0.5005,
          1.5389,  1.2314]], grad_fn=<EmbeddingBackward>)

In [None]:
# context_size=5: each context is the document label plus two words on either side of the target.
model = Doc2Vec(vocab_size=len(vocab), embedding_dim= 10, context_size= 5)

In [None]:
# Forward pass for a single training context (row 1 of `rec`), cast from float64 to long.
log_probs = model(rec[1, :].type(torch.long))

In [None]:
log_probs

tensor([[-2.5496, -3.3986, -3.9204, -2.5497, -4.3944, -3.4811, -3.0037, -2.0418,
         -3.3976, -2.9969, -3.6387, -4.4810, -4.2514, -3.7076, -3.8133, -2.8552,
         -3.2783, -3.4504, -3.2052, -4.3678, -4.0273, -4.1972, -3.4334, -3.5503,
         -4.1352, -3.8756, -4.1995, -3.7846, -3.2796]],
       grad_fn=<LogSoftmaxBackward>)

In [None]:
# Negative log-likelihood loss; it takes log-probabilities, which is what the model's log_softmax returns.
loss_function = nn.NLLLoss()

In [None]:
targ[1]

tensor([6.], dtype=torch.float64)

In [None]:
# Loss for one sample: targ[1] is a 1-element float64 tensor holding the target index, cast to long here.
loss = loss_function(log_probs, targ[1].type(torch.long))

In [None]:
rec[1]

tensor([0., 4., 5., 7., 8.], dtype=torch.float64)

In [None]:
# Train a fresh model with plain SGD, taking one optimisation step per
# (context, target) pair and recording the summed loss of every epoch.
losses = []
loss_function = nn.NLLLoss()
model = Doc2Vec(vocab_size=len(vocab), embedding_dim=10, context_size=5)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(100):
    total_loss = 0
    for context_row, target_row in zip(rec, targ):
        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Indices are stored as float64, so cast them before use.
        log_probs = model(context_row.type(torch.long))
        loss = loss_function(log_probs, target_row.type(torch.long))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    losses.append(total_loss)
print(losses)

[47.39168453216553, 45.407543659210205, 43.45357966423035, 41.53386163711548, 39.652857542037964, 37.81536543369293, 36.02639925479889, 34.29104030132294, 32.61424505710602, 31.000614047050476, 29.454163312911987, 27.978100419044495, 26.574644565582275, 25.24492347240448, 23.988940238952637, 22.805635392665863, 21.69302123785019, 20.648351967334747, 19.6683269739151, 18.749293744564056, 17.88741832971573, 17.078831046819687, 16.31974032521248, 15.606509894132614, 14.93570727109909, 14.304137468338013, 13.708848595619202, 13.14713191986084, 12.61651685833931, 12.114751622080803, 11.639790132641792, 11.189775809645653, 10.763016238808632, 10.357974261045456, 9.973248846828938, 9.607558496296406, 9.259732238948345, 8.92869370430708, 8.613452769815922, 8.313098810613155, 8.02678445726633, 7.753725975751877, 7.493195429444313, 7.24451270699501, 7.0070458091795444, 6.780197869986296, 6.563415169715881, 6.35617370903492, 6.157983168959618, 5.9683836586773396, 5.786938086152077, 5.613237284123

In [None]:
# A new, unseen "document" whose words all occur in the training vocabulary.
doc4 = "This is exploring our first document".split()

# Collect the trained embedding row of each of its words.
doc4embs = [model.embeddings.weight[word_to_ix[word]] for word in doc4]

doc4embs

[tensor([-0.2182,  0.4983,  0.1173,  0.0914,  0.3153, -0.5963, -1.0619,  0.8877,
          0.3174, -0.9643], grad_fn=<SelectBackward>),
 tensor([-2.6197, -3.0649, -0.0554,  0.6509,  0.7018, -0.1657,  0.2540,  1.7208,
          0.3995,  0.4790], grad_fn=<SelectBackward>),
 tensor([-0.3405, -0.9031, -1.3256, -0.2440,  0.3910,  0.5161,  0.5112, -0.7389,
         -0.7672,  1.3889], grad_fn=<SelectBackward>),
 tensor([ 1.8204,  0.0410, -0.1428, -0.9709, -0.7335,  1.2012,  0.1200, -0.4249,
          1.4659,  1.4000], grad_fn=<SelectBackward>),
 tensor([-1.9927, -2.4304,  0.0723,  0.3660,  0.5282,  0.4519,  1.2636,  0.4118,
         -1.9838,  1.6796], grad_fn=<SelectBackward>),
 tensor([-1.5148, -1.1591,  0.2386,  1.5738, -0.9429,  1.5610, -0.1037,  0.1501,
          1.5666,  0.4109], grad_fn=<SelectBackward>)]

In [None]:
# Represent doc4 by the element-wise average of its word embeddings.
doc4vector = torch.stack(doc4embs).mean(dim=0)

In [None]:
# Trained embedding rows of the three document labels.
doc1vector, doc2vector, doc3vector = (
    model.embeddings.weight[word_to_ix[label]]
    for label in ("doc1", "doc2", "doc3")
)

In [None]:
# Detach the vectors from the autograd graph and view them as NumPy arrays.
d4, d1, d2, d3 = (
    vec.detach().numpy()
    for vec in (doc4vector, doc1vector, doc2vector, doc3vector)
)

# Cosine similarity between doc4 and each trained document vector:
# dot product divided by the product of the Euclidean norms.
# The vectors are 1-D, so no transpose is needed.
cos_sim41 = (d4 @ d1) / (np.linalg.norm(d4) * np.linalg.norm(d1))
cos_sim42 = (d4 @ d2) / (np.linalg.norm(d4) * np.linalg.norm(d2))
cos_sim43 = (d4 @ d3) / (np.linalg.norm(d4) * np.linalg.norm(d3))

In [None]:
cos_sim41, cos_sim42, cos_sim43

(0.07801964, -0.02057004, 0.7141524)

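The averaging technique used above to generate doc4's vector has returned quite an odd result: in the run shown, doc4 is most similar to doc3 (about 0.71), whereas doc1, which shares nearly all of its words with doc4, scores only about 0.08. No random seed is set, so the exact values will differ from run to run.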

In [None]:
# Print doc1's tokens next to doc4's for a side-by-side comparison.
print(text["doc1"])
print(doc4)


['This', 'is', 'our', 'first', 'document', 'for', 'exploring']
['This', 'is', 'exploring', 'our', 'first', 'document']
