In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
import numpy as np
from io import open

In [2]:
def procData(path):
    file = open(path)
    
    data = ""
    
    for i in range(0,10):
        data = data + (file.readline()).lower()
    #print(data)
    data = re.sub("\\t","", data)
    data = re.sub("\\n","", data)
    data = re.sub("  ", " ", data)
    data = data.split(" . ")

    holder = []
    for d in data:
        if d != "":
            holder.append(d)   
    data = holder

    for i in range(len(data)):
        data[i] = data[i] + " . "
        
    o1 = set(''.join(data).split(" "))
        
    for j in range(len(data)):
        data[j] = data[j].split(" ")

        holder = []
        for i in range(len(data[j])):
            if data[j][i] != "":
                holder.append(data[j][i])
        data[j] = holder  
    
    return data, o1    

In [3]:
path = "recipe_tokenized_instructions_.txt"
data, vocab = procData(path)

In [4]:
pentagrams = [([data[j][i], data[j][i + 1]], data[j][i + 2])
              for j in range(len(data))
                for i in range(len(data[j]) - 2)]
print(pentagrams[:5])

[(['preheat', 'the'], 'oven'), (['the', 'oven'], 'to'), (['oven', 'to'], '350'), (['to', '350'], 'f'), (['350', 'f'], '.')]


In [5]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 32
word_indices = {word: i for i, word in enumerate(vocab)}

In [9]:
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
    
    def getEmb(self):
        return self.embeddings
    
    def getProb(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [10]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [11]:
for epoch in range(25):
    if (epoch % 1 == 0):
        print("Epoch: ", epoch)
    total_loss = 0
    for context, target in pentagrams:
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_indices[w] for w in context], dtype=torch.long)
        
        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)
        print(log_probs.shape)
        
        
        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_indices[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(round(total_loss, 3))
print(losses)  # The loss decreased every iteration over the training data!

Epoch:  0
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
tor

torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1

torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1

torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1

torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1

torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1

torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1

torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1, 315])
torch.Size([1

KeyboardInterrupt: 

In [None]:
em = model.embeddings(torch.tensor([word_indices["mix"]]))
#print(em)


imp = torch.tensor([word_indices['preheat'], word_indices['the']], dtype=torch.long)
t = model.getProb(imp)
#print(t[0])
values, indices = t[0].max(0)

print(values, indices)
print(word_indices)

#normalized_embedding = t.weight/((t.weight**2).sum(0)**0.5).expand_as(t.weight)
#dist, ind = torch.topk(torch.mv(normalized_embedding,em[0]),5)

#estimation = int(ind[0])

#print(estimation)
#print(word_indices)

In [None]:
print(t.shape)

In [None]:
count = 0.0
for i in word_indices:
    em = model.embeddings(torch.tensor([word_indices[i]]))
    t = model.getEmb()
    normalized_embedding = t.weight/((t.weight**2).sum(0)**0.5).expand_as(t.weight)
    dist, ind = torch.topk(torch.mv(normalized_embedding,em[0]),5)
    
    estimation = int(ind[0])
    target = word_indices[i]
    
    if (estimation == target):
        count += 1.0

print(count/len(word_indices))

In [None]:
dL = 0
for i in data:
    dL = dL + len(i)

In [None]:
wp = {word: 0 for word in vocab}

for i in data:
    for j in i:
        wp[j] = wp[j] + (1/dL)

In [None]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds, eigs

In [None]:
# From "A SIMPLE BUT TOUGH-TO-BEAT BASELINE FOR SENTENCE EMBEDDINGS"
vs = {" ".join(sentence): np.zeros([EMBEDDING_DIM]) for sentence in data}
for i in range(len(data)):
    sb = 1/len(data[i])
    a = 1
    
    vw = np.zeros([EMBEDDING_DIM])
    for j in data[i]:
        p = wp[j]
        vw = vw + (a/(a+p)) * model.embeddings(torch.tensor([word_indices[j]]))[0].data.numpy()
    vw = vw * sb
    index = " ".join(data[i])
    vs[index] = vw

    
X = np.zeros((EMBEDDING_DIM, len(vs)))
for i in range(len(vs)):    
    X[:,i] = vs[" ".join(data[i])].reshape(EMBEDDING_DIM)

u, s, vt = svds(X, k=1, return_singular_vectors=True)

for s in vs:
    vs[s] = vs[s].reshape(EMBEDDING_DIM,1)
    vs[s] = vs[s] - np.dot(np.dot(u,u.T),vs[s])

In [21]:
t = ["preheat the oven to 350 f .", "preheat the oven to 425 f .", "reserved that blender to a halibut ."]

In [22]:
# From "A SIMPLE BUT TOUGH-TO-BEAT BASELINE FOR SENTENCE EMBEDDINGS"
vst = {sentence: np.zeros([EMBEDDING_DIM]) for sentence in t}
print(vst)

{'preheat the oven to 350 f .': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'preheat the oven to 425 f .': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'reserved that blender to a halibut .': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}


In [23]:
for i in range(len(vst)):
    sb = 1/len(t)
    a = 1
    
    vw = np.zeros([EMBEDDING_DIM])
    for j in t[i].split(" "):
        print(j)
        p = wp[j]
        vw = vw + (a/(a+p)) * model.embeddings(torch.tensor([word_indices[j]]))[0].data.numpy()
    vw = vw * sb
    index = t[i]
    vst[index] = vw
    
X = np.zeros((EMBEDDING_DIM, len(vst)))
for i in range(len(vst)):    
    X[:,i] = vst[t[i]].reshape(EMBEDDING_DIM)
    
u, s, vt = svds(X, k=1, return_singular_vectors=True)

for s in vst:
    print(vst[s].shape)
    vst[s] = vst[s] - u*u.T*vst[s] ## vst is (20,), see what happens if it's (20,1)
    #print(u.shape)
    #print(vst[s].shape)
    #print(s)
    #print(vst[s])

preheat
the
oven
to
350
f
.
preheat
the
oven
to
425
f
.
reserved
that
blender
to
a
halibut
.
(32,)
(32,)
(32,)


In [24]:
np.linalg.norm(vst[t[0]]-vst[t[1]])

14.940371380508081

In [25]:
np.linalg.norm(vst[t[0]]-vst[t[2]])

40.28320464221657