<a href="https://colab.research.google.com/github/Sghosh32/Word2Vec/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim
from sklearn.manifold import TSNE
from matplotlib import cm
import numpy as np

In [None]:
# Toy training text. It is tokenised on whitespace only, so sentence-final
# full stops stay attached to the preceding word (e.g. 'capital.').
corpus_text = """vocabulary is basically a list of unique words with assigned indices. corpus is very simple and short. warsaw is poland capital. berlin is germany
 capital. paris is france capital. the paper was eventually banned. the future of the bears was under discussion. he had a medal. he is a king. she is a 
 queen. all the roads going into the town were gravelled. he drove off the road. i need you to sign these papers. we drink coffee every morning."""

# Flat list of tokens in reading order.
corpus = corpus_text.split()

In [None]:
def make_context_vector(context, word_to_ix, vocab_size):
  """Translate a list of context words into a 1-D tensor of vocabulary indices.

  Args:
    context: iterable of words.
    word_to_ix: mapping from word to integer index; a word missing from it
      raises ``KeyError``.
    vocab_size: accepted for call-site compatibility; not used here.

  Returns:
    ``torch.long`` tensor holding one index per word, in the given order.
  """
  indices = []
  for word in context:
    indices.append(word_to_ix[word])
  return torch.tensor(indices, dtype = torch.long)

In [None]:
# Margin, in tokens, left at each end of the corpus when the
# (context, target) training pairs are built.
context_size = 2
# Width of each word's embedding vector; passed to the Word2Vec model.
embedding_dimension = 100

In [None]:
# Unique tokens of the corpus. `corpus` is already a list of tokens, so no
# join/split round trip is needed. Sorting matters: plain set iteration order
# for strings depends on per-process hash randomisation, which would assign
# different indices to the same word after every kernel restart.
vocab = sorted(set(corpus))
print(vocab)
print(len(vocab))

['vocabulary', 'is', 'basically', 'a', 'list', 'of', 'unique', 'words', 'with', 'assigned', 'indices.', 'corpus', 'is', 'very', 'simple', 'and', 'short.', 'warsaw', 'is', 'poland', 'capital.', 'berlin', 'is', 'germany', 'capital.', 'paris', 'is', 'france', 'capital.', 'the', 'paper', 'was', 'eventually', 'banned.', 'the', 'future', 'of', 'the', 'bears', 'was', 'under', 'discussion.', 'he', 'had', 'a', 'medal.', 'he', 'is', 'a', 'king.', 'she', 'is', 'a', 'queen.', 'all', 'the', 'roads', 'going', 'into', 'the', 'town', 'were', 'gravelled.', 'he', 'drove', 'off', 'the', 'road.', 'i', 'need', 'you', 'to', 'sign', 'these', 'papers.', 'we', 'drink', 'coffee', 'every', 'morning.']
['a', 'corpus', 'bears', 'simple', 'basically', 'he', 'town', 'these', 'unique', 'and', 'roads', 'france', 'papers.', 'morning.', 'she', 'king.', 'capital.', 'words', 'coffee', 'the', 'list', 'is', 'germany', 'paris', 'into', 'paper', 'warsaw', 'drove', 'very', 'queen.', 'need', 'under', 'discussion.', 'indices.', 

In [None]:
# Two-way lookup between words and their position in `vocab`.
ix_to_word = dict(enumerate(vocab))
word_to_ix = {word: ix for ix, word in ix_to_word.items()}

# Number of distinct words; sizes the embedding table and the output layer.
vocab_size = len(vocab)
print(ix_to_word) 

{0: 'a', 1: 'corpus', 2: 'bears', 3: 'simple', 4: 'basically', 5: 'he', 6: 'town', 7: 'these', 8: 'unique', 9: 'and', 10: 'roads', 11: 'france', 12: 'papers.', 13: 'morning.', 14: 'she', 15: 'king.', 16: 'capital.', 17: 'words', 18: 'coffee', 19: 'the', 20: 'list', 21: 'is', 22: 'germany', 23: 'paris', 24: 'into', 25: 'paper', 26: 'warsaw', 27: 'drove', 28: 'very', 29: 'queen.', 30: 'need', 31: 'under', 32: 'discussion.', 33: 'indices.', 34: 'i', 35: 'eventually', 36: 'of', 37: 'we', 38: 'banned.', 39: 'vocabulary', 40: 'off', 41: 'with', 42: 'drink', 43: 'berlin', 44: 'poland', 45: 'medal.', 46: 'were', 47: 'all', 48: 'every', 49: 'was', 50: 'to', 51: 'short.', 52: 'going', 53: 'sign', 54: 'future', 55: 'assigned', 56: 'gravelled.', 57: 'you', 58: 'had', 59: 'road.'}


In [None]:
# Build (context, target) training pairs: each target word is paired with the
# `context_size` words on its left and the `context_size` words on its right.
# The range bounds guarantee that a full window exists on both sides, so the
# window width and the skipped margin are now driven by the same constant
# (previously the window was hard-coded to one word per side).
data = []
for i in range(context_size, len(corpus) - context_size):
  context = corpus[i - context_size:i] + corpus[i + 1:i + 1 + context_size]
  target = corpus[i]
  data.append((context, target))
print(data[0])

(['is', 'a'], 'basically')


In [None]:
class Word2Vec(nn.Module):
  """Predicts a word from the summed embeddings of its context words.

  Pipeline: embedding lookup -> sum over the context -> linear(128) + tanh
  -> linear(vocab_size) + log-softmax. The output is log-probabilities, so it
  is meant to be paired with ``nn.NLLLoss``.

  Args:
    vocab_size: number of rows in the embedding table and number of output
      classes.
    embedding_dimension: length of each embedding vector.
  """

  def __init__(self, vocab_size, embedding_dimension):
    super().__init__()

    self.embeddings = nn.Embedding(vocab_size, embedding_dimension)
    self.hidden1 = nn.Linear(embedding_dimension, 128)
    self.activation_function1 = nn.Tanh()

    self.hidden2 = nn.Linear(128, vocab_size)
    self.activation_function2 = nn.LogSoftmax(dim = -1)

  def forward(self, x):
    """Return log-probabilities over the vocabulary for one context.

    Args:
      x: 1-D ``torch.long`` tensor of context word indices.

    Returns:
      Tensor of shape ``(1, vocab_size)`` containing log-probabilities.
    """
    # Tensor.sum does the reduction in a single op; the builtin sum() adds
    # row by row in Python and breaks on an empty context.
    word_embeddings = self.embeddings(x).sum(dim = 0).view(1, -1)
    output = self.hidden1(word_embeddings)
    output = self.activation_function1(output)
    output = self.hidden2(output)
    output = self.activation_function2(output)
    return output

  def get_word_embedding(self, word, vocab = None):
    """Return the embedding of ``word`` as a ``(1, embedding_dimension)`` tensor.

    Args:
      word: the word to look up.
      vocab: optional word -> index mapping. When omitted, the notebook-level
        ``word_to_ix`` dictionary is used, as before.

    Raises:
      KeyError: if ``word`` is not in the mapping.
    """
    lookup = word_to_ix if vocab is None else vocab
    # Build the index on the same device as the embedding table so the lookup
    # also works after the model has been moved off the CPU.
    index = torch.tensor([lookup[word]], dtype = torch.long,
                         device = self.embeddings.weight.device)
    return self.embeddings(index).view(1, -1)

In [None]:
# Seed PyTorch's RNG before the model is created so that the randomly
# initialised weights, and therefore the training run, are repeatable.
torch.manual_seed(42)

model = Word2Vec(vocab_size, embedding_dimension)

# The model ends in LogSoftmax, so negative log-likelihood is the matching loss.
criterion = nn.NLLLoss()
optimize = optim.Adam(model.parameters(), lr = 0.005)

In [None]:
N_EPOCHS = 500    # full passes over the training pairs
LOG_EVERY = 100   # print the summed loss every this many epochs

# The index tensors for each (context, target) pair are identical in every
# epoch, so build them once instead of once per pair per epoch.
training_pairs = [
  (make_context_vector(context, word_to_ix, vocab_size),
   torch.tensor([word_to_ix[target]]))
  for context, target in data
]

# Full-batch training: the per-pair losses are summed over the whole data set
# and a single optimizer step is taken per epoch.
for epoch in range(N_EPOCHS):
  optimize.zero_grad()

  loss = 0
  for context_vector, target_ix in training_pairs:
    log_probs = model(context_vector)
    loss += criterion(log_probs, target_ix)

  if (epoch + 1) % LOG_EVERY == 0:
    print('Epoch: ', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))

  loss.backward()
  optimize.step()



Epoch:  0100 cost = 17.008297
Epoch:  0200 cost = 16.950745
Epoch:  0300 cost = 16.926922
Epoch:  0400 cost = 16.914736
Epoch:  0500 cost = 16.907621
