In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

In [2]:
# Load the sentence-pair dataset; later cells read its 'sentence1' and 'sentence2' columns.
df = pd.read_csv('data.csv')

In [3]:
# Keep only the first 100 rows of each sentence column, as plain Python lists,
# so the vocabulary and the training set stay small.
sentence_1_text = df['sentence1'].iloc[:100].tolist()
sentence_2_text = df['sentence2'].iloc[:100].tolist()

In [4]:
# One flat list of sentences: every sentence1 entry followed by every sentence2 entry.
corpus = [*sentence_1_text, *sentence_2_text]

In [5]:
# Tokenise on whitespace. `words` keeps corpus order and is reused by the cell
# that builds the (context, target) training pairs.
words = " ".join(corpus).split()

# Sort the unique tokens so every word gets the same index on every run.
# Iterating a bare set of strings can yield a different order after each
# kernel restart (string hash randomisation), which would silently reshuffle
# the embedding rows between runs.
vocab = sorted(set(words))
word_to_ix = {word: idx for idx, word in enumerate(vocab)}
ix_to_word = dict(enumerate(vocab))
vocab_size = len(vocab)

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
EMDEDDING_DIM = 10  # (sic) spelling kept because the model cell references this name

In [6]:
# Build (context, target) pairs with a symmetric window of CONTEXT_SIZE words
# on each side of the target. The window is derived from the constant instead
# of a literal 2 so the two cannot drift apart.
# NOTE(review): `words` is the whole corpus joined together, so windows can
# span the boundary between two sentences — confirm this is intended.
data = []
for i in range(CONTEXT_SIZE, len(words) - CONTEXT_SIZE):
    # left neighbours followed by right neighbours, target itself excluded
    context = words[i - CONTEXT_SIZE:i] + words[i + 1:i + CONTEXT_SIZE + 1]
    target = words[i]
    data.append((context, target))
print(data[:5])

[(['paris', 'october', 'secretly', 'met'], '1560'), (['october', '1560', 'met', 'english'], 'secretly'), (['1560', 'secretly', 'english', 'ambassador'], 'met'), (['secretly', 'met', 'ambassador', 'nicolas'], 'english'), (['met', 'english', 'nicolas', 'throckmorton'], 'ambassador')]


In [7]:


def make_context_vector(context, word_to_ix):
    """Map a sequence of context words to a 1-D tensor of vocabulary indices.

    Args:
        context: Iterable of words; each must be a key of ``word_to_ix``.
        word_to_ix: Mapping from word to integer index.

    Returns:
        A ``torch.long`` tensor holding one index per context word, in order.

    Raises:
        KeyError: If a context word is missing from ``word_to_ix``.
    """
    indices = tuple(word_to_ix[token] for token in context)
    return torch.tensor(indices, dtype=torch.long)






# Note: the vocabulary is built by deriving a set from the corpus words (cell In [5]), which removes duplicates.














In [8]:
class CBOW(torch.nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed, passed through a
    128-unit ReLU hidden layer, and projected to log-probabilities over the
    vocabulary.

    Args:
        vocab_size: Number of distinct words (rows of the embedding table and
            size of the output distribution).
        embedding_dim: Length of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        # out: 1 x embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()

        # out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities of the target word for the given context.

        Args:
            inputs: Long tensor of word indices, either 1-D ``(context,)`` for
                a single example or 2-D ``(batch, context)`` for several.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a 1-D input, or
            ``(batch, vocab_size)`` for a 2-D input.
        """
        # Reduce over the context axis with a tensor op rather than Python's
        # builtin sum; using dim=-2 makes the same line work for batched input.
        embeds = self.embeddings(inputs).sum(dim=-2)
        # Always hand a 2-D (rows, embedding_dim) matrix to the linear layers.
        embeds = embeds.reshape(-1, self.embeddings.embedding_dim)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        """Return the embedding of ``word`` as a ``(1, embedding_dim)`` tensor.

        The misspelled method name is kept because other cells call it.
        The index is looked up in the module-level ``word_to_ix`` mapping, so
        that global must exist and contain ``word`` (otherwise ``KeyError``).
        The result is still attached to the autograd graph.
        """
        index = torch.tensor([word_to_ix[word]])
        return self.embeddings(index).view(1, -1)

In [9]:
# Seed PyTorch's global RNG so the randomly initialised weights (and therefore
# the trained embeddings) are the same on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

model = CBOW(vocab_size, EMDEDDING_DIM)

# NLLLoss expects log-probabilities, which is what the model's LogSoftmax emits.
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [10]:


#TRAINING
# Full-batch training: each of the 20 epochs sums the loss over every
# (context, target) pair and then takes a single SGD step.
for epoch in range(20):
    per_example_losses = []

    for context, target in data:
        context_vector = make_context_vector(context, word_to_ix)
        log_probs = model(context_vector)
        target_ix = torch.tensor([word_to_ix[target]])
        per_example_losses.append(loss_function(log_probs, target_ix))

    # builtin sum starts from 0 and adds the losses left to right
    total_loss = sum(per_example_losses)

    # optimize once, at the end of the epoch
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

In [11]:

#TESTING
# Run the trained model on one context window from the training data.
context = ['paris', 'october', 'secretly', 'met']
context_vector = make_context_vector(context, word_to_ix)
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    a = model(context_vector)


In [12]:
#Print result
# The model returns one row of log-probabilities; its argmax is the predicted word index.
predicted_ix = torch.argmax(a[0]).item()

print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[predicted_ix]}')

Context: ['paris', 'october', 'secretly', 'met']

Prediction: born


In [13]:
# Look up the learned embedding of a single vocabulary word.
t=model.get_word_emdedding("paris")

In [14]:
# Check the type of the object returned by the embedding lookup.
type(t)

torch.Tensor

In [15]:
# Display the embedding values.
t

tensor([[-2.3689, -0.6885,  0.9433, -1.5555,  0.5560, -0.4342, -0.0408, -1.2490,
         -0.0025,  0.4616]], grad_fn=<ViewBackward0>)

In [16]:
# Cache the model's embedding for every vocabulary word, keyed by the word itself.
word_embeddings_dict = dict()

for word in vocab:
    embedding = word_embeddings_dict[word] = model.get_word_emdedding(word)

In [17]:
# Spot-check the cached embedding of one word.
word_embeddings_dict["player"]

tensor([[ 0.4788,  0.4711, -0.1946,  0.5628,  0.6132,  0.6614,  0.7245,  0.9574,
         -1.5689, -0.5242]], grad_fn=<ViewBackward0>)

In [18]:
# Inspect the Python type of one entry of the 'sentence1' column.
type(df["sentence1"][1])

str

In [19]:


def get_document_embedding(sentence, embedding_dictionary):
    """Average the embeddings of a sentence's words into one document vector.

    Words that have no entry in ``embedding_dictionary`` are skipped instead
    of raising ``KeyError``, so sentences outside the training vocabulary can
    still be embedded from the words that are known.

    Args:
        sentence: Whitespace-separated text.
        embedding_dictionary: Mapping from word to an embedding tensor whose
            first axis is concatenated over, e.g. shape ``(1, embedding_dim)``.

    Returns:
        The mean of the known words' embeddings along axis 0, or ``None``
        when the sentence contains no word present in the dictionary
        (including the empty sentence).
    """
    word_embeddings = [
        embedding_dictionary[word]
        for word in sentence.split()
        if word in embedding_dictionary
    ]

    # torch.cat rejects an empty list; signal "no embedding" to the caller.
    if not word_embeddings:
        return None

    return torch.mean(torch.cat(word_embeddings, dim=0), dim=0)


In [21]:
# Smoke test: document embedding of a two-word sentence (the double space is absorbed by split()).
get_document_embedding("paris  nba",word_embeddings_dict)

tensor([-1.8012, -0.4952,  0.7508, -1.4469,  0.5266,  0.8998,  0.7528, -0.7879,
         0.1887, -0.0351], grad_fn=<MeanBackward1>)

In [28]:
# def levenshtein_similarity(embeddings1, embeddings2):
#     sentence1 = ' '.join([str(embedding) for embedding in embeddings1])
#     sentence2 = ' '.join([str(embedding) for embedding in embeddings2])
#     lev_dist = levenshtein_distance(sentence1, sentence2)
#     max_length = max(len(sentence1), len(sentence2))
#     normalized_lev_dist = lev_dist / max_length
#     similarity = 1 - normalized_lev_dist
#     return similarity

# similarity = levenshtein_similarity(embedding_dictionary, sentence2_embeddings)
# print("Levenshtein Distance-based Similarity:", similarity)


In [26]:
from Levenshtein import distance as levenshtein_distance

def levenshtein_similarity(sentence1, sentence2, embedding_dictionary):
    """Score two sentences by the edit distance between their embedding printouts.

    Each sentence is turned into a document embedding, both embeddings are
    converted with ``str()``, and the Levenshtein distance between the two
    strings is normalised by the longer string's length.

    NOTE(review): the comparison runs on the textual form of the tensors
    (rounded digits, brackets, any ``grad_fn=...`` suffix), not on the numeric
    vectors themselves — confirm this is the intended metric.

    Args:
        sentence1: First sentence (whitespace-separated text).
        sentence2: Second sentence.
        embedding_dictionary: Word-to-embedding mapping passed through to
            ``get_document_embedding``.

    Returns:
        ``1 - distance / max_length``, or ``0`` when either document
        embedding is ``None``.
    """
    embedding1 = get_document_embedding(sentence1, embedding_dictionary)
    embedding2 = get_document_embedding(sentence2, embedding_dictionary)

    # Guard clause: nothing to compare without both embeddings.
    if embedding1 is None or embedding2 is None:
        return 0

    text1, text2 = str(embedding1), str(embedding2)
    edit_distance = levenshtein_distance(text1, text2)
    longest = max(len(text1), len(text2))
    return 1 - edit_distance / longest


In [27]:
# Compare one sentence pair under two embedding dictionaries and print both scores.
# NOTE(review): `sentence1`, `sentence2`, `skipgram_embedding_dictionary` and
# `cbow_embedding_dictionary` are not assigned in any cell visible in this
# notebook, so this cell presumably relies on state from cells that were
# removed or run elsewhere — TODO define them before Restart & Run All.
skipgram_similarity = levenshtein_similarity(sentence1, sentence2, skipgram_embedding_dictionary)
cbow_similarity = levenshtein_similarity(sentence1, sentence2, cbow_embedding_dictionary)

print("Skip-gram Levenshtein Distance-based Similarity:", skipgram_similarity)
print("CBOW Levenshtein Distance-based Similarity:", cbow_similarity)

Skip-gram Levenshtein Distance-based Similarity: 0.6705479452054794
CBOW Levenshtein Distance-based Similarity: 0.6444223107569721
