In [144]:
import torch
from torch import nn

In [145]:
text = "Milk is used in making bread and we can make Sandwich from bread"
# Whitespace tokenisation; every token is lower-cased so that "Sandwich"
# and "sandwich" would share one vocabulary entry.
tokens = [word.lower() for word in text.split()]
tokens

['milk',
 'is',
 'used',
 'in',
 'making',
 'bread',
 'and',
 'we',
 'can',
 'make',
 'sandwich',
 'from',
 'bread']

In [146]:
# Map each distinct token to an integer id, numbered in order of first
# appearance (dict.fromkeys de-duplicates while preserving that order).
vocab = {token: index for index, token in enumerate(dict.fromkeys(tokens))}
vocab

{'milk': 0,
 'is': 1,
 'used': 2,
 'in': 3,
 'making': 4,
 'bread': 5,
 'and': 6,
 'we': 7,
 'can': 8,
 'make': 9,
 'sandwich': 10,
 'from': 11}

In [147]:
# model
class w2vec(nn.Module):
    """Two-layer linear network used for the CBOW and skip-gram experiments.

    The input (a vocabulary-sized vector) is projected to ``hidden`` features
    and then back to vocabulary-sized scores, with no non-linearity between
    the two layers.

    Args:
        vocab: Mapping from word to integer index. ``len(vocab)`` fixes the
            input and output width of the network.
        hidden: Number of hidden features, i.e. the embedding dimension.
    """

    def __init__(self,vocab, hidden):
        super().__init__()
        self.v = vocab
        vocab_size = len(self.v)
        # weight matrix = input vectors W (stored as hidden x vocab_size)
        self.hidden_layer = nn.Linear(in_features=vocab_size, out_features=hidden)
        # weight matrix = output vectors W' (stored as vocab_size x hidden)
        self.out = nn.Linear(in_features=hidden, out_features=vocab_size)

    def forward(self,x):
        """Project ``x`` to the hidden space and return the vocabulary scores."""
        hidden_repr = self.hidden_layer(x)
        return self.out(hidden_repr)

    def get_input_vec(self,word):
        """Return a snapshot of the input vector (column of W) for ``word``.

        The result is a detached copy: it carries no autograd history and does
        not share storage with the trainable weights, so later training steps
        or in-place edits of the returned tensor cannot affect one another.

        Raises:
            KeyError: If ``word`` is missing from a dict-like vocabulary.
        """
        index = self.v[word]
        return self.hidden_layer.weight.detach()[:, index].clone()

    def get_output_vec(self,word):
        """Return a snapshot of the output vector (row of W') for ``word``.

        Like ``get_input_vec``, the result is a detached copy of the weights.

        Raises:
            KeyError: If ``word`` is missing from a dict-like vocabulary.
        """
        index = self.v[word]
        return self.out.weight.detach()[index].clone()

In [148]:
# Seed before construction so the random weight initialisation is repeatable.
torch.manual_seed(42)
cbow = w2vec(vocab=vocab, hidden=100)
# Display the shape of the input-side weight matrix.
cbow.hidden_layer.weight.shape

torch.Size([100, 12])

In [149]:
def one_hot_encoder(word):
    """Return ``word`` as a one-hot list of ints over the global ``vocab``.

    The list has ``len(vocab)`` entries, all 0 except a 1 at ``vocab[word]``.
    """
    hot_index = vocab[word]
    return [1 if position == hot_index else 0 for position in range(len(vocab))]

In [150]:
def generate_context(tokens,center,window_size=3):
    """One-hot encode the words surrounding ``tokens[center]``.

    Up to ``window_size`` tokens on each side of position ``center`` are
    taken (fewer near either end of the sequence); the centre token itself is
    left out.

    Returns:
        A tensor built from one ``one_hot_encoder`` row per context word,
        ordered left neighbours first, then right neighbours.
    """
    left_edge = max(center - window_size, 0)
    right_edge = min(center + window_size + 1, len(tokens))
    before = tokens[left_edge:center]
    after = tokens[center + 1:right_edge]
    return torch.tensor([one_hot_encoder(neighbour) for neighbour in before + after])

In [151]:
# Cross-entropy over the vocabulary scores; plain SGD updates the CBOW network.
loss_fun = nn.CrossEntropyLoss()
optimizer1 = torch.optim.SGD(cbow.parameters(), lr=0.1)

CBOW MODEL

In [152]:
# Create contexts and pass the suitable input
epochs = 100

# The (averaged context, centre-word index) pairs are identical in every
# epoch, so they are encoded once up front instead of once per epoch.
cbow_examples = []
for center in range(len(tokens)):
    context = generate_context(tokens, center = center, window_size = 2)
    # CBOW input: the mean of the one-hot vectors of the surrounding words.
    avg_input = torch.sum(context,dim=0)/len(context)
    cbow_examples.append((avg_input, torch.tensor(vocab[tokens[center]])))

cbow.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    for avg_input, target in cbow_examples:
        pred = cbow(avg_input)
        loss = loss_fun(pred, target)
        # Accumulate a plain float: adding the loss tensor itself would keep
        # each step's autograd history reachable for the rest of the epoch.
        epoch_loss += loss.item()
        loss.backward()
        optimizer1.step()
        optimizer1.zero_grad()
    # Report the mean per-token loss every 10th epoch.
    if epoch % 10 == 9:
        print(f"Epoch : {epoch+1} | train_loss : {epoch_loss/len(tokens)}")

Epoch : 10 | train_loss : 1.958733320236206
Epoch : 20 | train_loss : 1.2391306161880493
Epoch : 30 | train_loss : 0.6945803165435791
Epoch : 40 | train_loss : 0.3786560297012329
Epoch : 50 | train_loss : 0.22482378780841827
Epoch : 60 | train_loss : 0.14783605933189392
Epoch : 70 | train_loss : 0.10551529377698898
Epoch : 80 | train_loss : 0.08004318922758102
Epoch : 90 | train_loss : 0.0635107085108757
Epoch : 100 | train_loss : 0.05211568623781204


SKIP-GRAM MODEL

In [153]:
# Re-seed so the skip-gram network is initialised from the same random stream
# state as the CBOW network was.
torch.manual_seed(42)
skip_gram = w2vec(vocab=vocab, hidden=100)
# Display the shape of the input-side weight matrix.
skip_gram.hidden_layer.weight.shape

torch.Size([100, 12])

In [154]:
# Separate SGD optimizer bound to the skip-gram network's parameters.
optimizer2 = torch.optim.SGD(skip_gram.parameters(), lr=0.1)

In [155]:
# Create contexts and pass the suitable input_word
epochs = 100

# The (one-hot centre word, context-word indices) pairs are the same in every
# epoch, so they are built once up front instead of once per epoch.
skip_gram_examples = []
for center in range(len(tokens)):
    context = generate_context(tokens, center = center, window_size = 2)
    input_word = torch.tensor(one_hot_encoder(tokens[center]),dtype=torch.float)
    # Each context row is one-hot, so the position of its maximum is the
    # vocabulary index of that context word.
    targets = context.argmax(dim=1)
    skip_gram_examples.append((input_word, targets))

skip_gram.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    for input_word, targets in skip_gram_examples:
        preds = skip_gram(input_word)
        # The same prediction is scored against every context word.
        stacked_preds = preds.repeat(len(targets),1)
        loss = loss_fun(stacked_preds,targets)
        # loss is the mean over the context words; multiply back to a sum.
        # .item() keeps the running total a plain float so no autograd
        # history is retained across steps.
        epoch_loss += loss.item()*len(targets)
        loss.backward()
        optimizer2.step()
        optimizer2.zero_grad()
    # Report the summed-context loss per token every 20th epoch.
    if epoch % 20 == 19:
        print(f"Epoch : {epoch+1} | train_loss : {epoch_loss/len(tokens)}")

Epoch : 20 | train_loss : 5.980834484100342
Epoch : 40 | train_loss : 5.283209800720215
Epoch : 60 | train_loss : 5.1318559646606445
Epoch : 80 | train_loss : 5.074250221252441
Epoch : 100 | train_loss : 5.043362617492676


In [157]:
def cosine_similarity(vec1,vec2,eps=1e-8):
    """Return the cosine of the angle between two 1-D tensors.

    Args:
        vec1: First vector (1-D tensor, as required by ``torch.dot``).
        vec2: Second vector, same length and dtype as ``vec1``.
        eps: Lower bound applied to the product of the two norms. It keeps
            the division finite when either vector has zero norm, in which
            case the result is 0 instead of NaN.

    Returns:
        A 0-dim tensor holding the similarity.
    """
    norm_product = torch.norm(vec1) * torch.norm(vec2)
    return torch.dot(vec1,vec2) / torch.clamp(norm_product, min=eps)

In [158]:
# For every vocabulary word, compare the CBOW model's input vector with its
# output vector for that same word.
print('Cosine similarity between input and output vectors for CBOW')
for word in vocab:
    similarity = cosine_similarity(cbow.get_input_vec(word), cbow.get_output_vec(word))
    print(word, f":{similarity}")

Cosine similarity between input and output vectors for CBOW
milk :-0.04099941626191139
is :-0.07781639695167542
used :-0.23917976021766663
in :-0.22917291522026062
making :-0.22500504553318024
bread :-0.43913477659225464
and :-0.20420560240745544
we :-0.12202194333076477
can :-0.2393714189529419
make :-0.20285667479038239
sandwich :-0.4179246723651886
from :-0.30360865592956543


In [159]:
# Same comparison as for CBOW, now within the skip-gram model.
print('Cosine similarity between input and output vectors for SKIP-GRAM')
for word in vocab:
    similarity = cosine_similarity(
        skip_gram.get_input_vec(word), skip_gram.get_output_vec(word)
    )
    print(word, f":{similarity}")

Cosine similarity between input and output vectors for SKIP-GRAM
milk :0.05938509479165077
is :0.06074342876672745
used :-0.02561378665268421
in :-0.12289121747016907
making :-0.24847109615802765
bread :-0.4889272451400757
and :-0.19889748096466064
we :-0.14856505393981934
can :-0.11800306290388107
make :-0.03872044011950493
sandwich :-0.155063658952713
from :-0.18093720078468323


In [160]:
# Compare each word's input vector in the skip-gram model with its input
# vector in the CBOW model.
print('Cosine similarity between input vectors of both models')
for word in vocab:
    similarity = cosine_similarity(
        skip_gram.get_input_vec(word), cbow.get_input_vec(word)
    )
    print(word, f":{similarity}")

Cosine similarity between input vectors of both models
milk :0.9849143028259277
is :0.987615704536438
used :0.9887687563896179
in :0.9806521534919739
making :0.9762115478515625
bread :0.9834405183792114
and :0.9898326396942139
we :0.9847304224967957
can :0.9677909016609192
make :0.9707474112510681
sandwich :0.9720782041549683
from :0.9752551317214966


In [161]:
# Compare each word's output vector in the skip-gram model with its output
# vector in the CBOW model.
print('Cosine similarity between output vectors of both models')
for word in vocab:
    similarity = cosine_similarity(
        skip_gram.get_output_vec(word), cbow.get_output_vec(word)
    )
    print(word, f":{similarity}")

Cosine similarity between output vectors of both models
milk :0.9645285606384277
is :0.9757919907569885
used :0.9560241103172302
in :0.967519223690033
making :0.981456458568573
bread :0.9522157907485962
and :0.9790166616439819
we :0.962906002998352
can :0.9760797619819641
make :0.9437264800071716
sandwich :0.9254117608070374
from :0.9560730457305908


In [170]:
# Look up the skip-gram input vectors for a handful of words in one pass.
# `_is` and `_in` carry a leading underscore because `is` and `in` are
# Python keywords.
(milk, bread, sandwich, we, make, making, _is, _in) = (
    skip_gram.get_input_vec(word)
    for word in ('milk', 'bread', 'sandwich', 'we', 'make', 'making', 'is', 'in')
)
# Similarity between the sum of the "milk" and "bread" vectors and "bread".
milk_bread = milk + bread
cosine_similarity(milk_bread,bread)

tensor(0.5546, grad_fn=<DivBackward0>)

In [176]:
# Similarity between the skip-gram input vectors of "is" and "in".
cosine_similarity(vec1=_is, vec2=_in)

tensor(0.0667, grad_fn=<DivBackward0>)