In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.linalg

In [None]:
# Reproducibility: seed PyTorch's global RNG before any layer is created, so the
# randomly initialised embeddings / linear weights (and the random example batch)
# start from the same values on every "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# CBOW is a window view; we are trying to infer the word in the middle.
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

# Toy corpus, tokenised by splitting on whitespace (punctuation stays attached to words).
raw_text= """In text retrieval, full-text search refers to techniques for searching a single computer-stored document or a collection in a full-text database. 
Full-text search is distinguished from searches based on metadata or on parts of the original texts represented in databases such as titles, 
abstracts, selected sections, or bibliographical references.In a full-text search, a search engine examines all of the words in every stored document as it tries to match search criteria 
(for example, text specified by a user). Full-text-searching techniques became common in online bibliographic databases in the 1990s.Many websites and application programs 
such as word processing software provide full-text-search capabilities. Some web search engines, such as AltaVista, employ full-text-search techniques, 
while others index only a portion of the web pages examined by their indexing systems.When dealing with a small number of documents, it is possible for the full-text-search engine to directly scan the contents of the documents with each query, 
a strategy called serial scanning.This is what some tools, such as grep, do when searching.""".split()

# By deriving a set from "raw_text", we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

# Basic Tokenizer.
# The iteration order of a set of strings is not stable between interpreter
# sessions (string hashing is randomised per process), so ids are assigned over
# the *sorted* vocabulary: the same word gets the same id on every run.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

print(len(raw_text))
print(vocab_size)

166
112


In [None]:
# Build the training "dataset": one (context, target) pair per position that has a
# full window on both sides. The context is the CONTEXT_SIZE words before the
# centre followed by the CONTEXT_SIZE words after it; the target is the centre word.
data = [
    (
        raw_text[center - CONTEXT_SIZE:center]
        + raw_text[center + 1:center + CONTEXT_SIZE + 1],
        raw_text[center],
    )
    for center in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE)
]
print(data[:5])


[(['In', 'text', 'full-text', 'search'], 'retrieval,'), (['text', 'retrieval,', 'search', 'refers'], 'full-text'), (['retrieval,', 'full-text', 'refers', 'to'], 'search'), (['full-text', 'search', 'to', 'techniques'], 'refers'), (['search', 'refers', 'techniques', 'for'], 'to')]


In [None]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: score the centre word from its context.

    The context word ids are embedded, the embeddings of one example are
    concatenated into a single vector, and a two-layer MLP ending in a
    log-softmax produces log-probabilities over the vocabulary.

    Args:
        vocab_size: number of rows of the embedding table and size of the
            output distribution.
        embed_dim: dimensionality of each word embedding.
        context: number of context words fed per example (left + right).
        hidden_size: width of the hidden linear layer.
    """

    def __init__(self, vocab_size, embed_dim, context, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Sequential(
            nn.Linear(context * embed_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
            nn.LogSoftmax(dim=-1),
        )

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: integer tensor of word ids, either a single example of
                shape ``(context,)`` or a batch of shape ``(batch, context)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single example, or
            ``(batch, vocab_size)`` for a batch.
        """
        # A lone example is treated as a batch of one, which keeps the
        # original (1, vocab_size) output for 1-D input.
        if inputs.dim() < 2:
            inputs = inputs.reshape(1, -1)
        out = self.embedding(inputs)      # (batch, context, embed_dim)
        # Concatenate the embeddings of each example separately; flattening
        # everything into one row would merge the whole batch.
        out = out.flatten(start_dim=1)    # (batch, context * embed_dim)
        return self.linear(out)

    # This is what we are actually interested in
    def get_word_vector(self, word):
        """Return the learned embedding for the word id tensor ``word``."""
        return self.embedding(word)


In [None]:
# Hyper-parameters used by the step-by-step walkthrough below.
VOCAB_SIZE = len(vocab)
EMBEDD_DIM = 10
BATCH_SIZE = 6
FULL_CONTEXT_SIZE = CONTEXT_SIZE * 2  # words on the left + words on the right
HIDDEN_SIZE = 256

# A random batch of word ids, shape (BATCH_SIZE, FULL_CONTEXT_SIZE).
# Moved to the notebook-wide `device` rather than a hard-coded 'cuda', so the
# cell also runs on machines without a GPU.
example_tensor = torch.randint(0, VOCAB_SIZE, [BATCH_SIZE, FULL_CONTEXT_SIZE]).to(device)
print(example_tensor)
print(device)

tensor([[ 69,  39,  13, 106],
        [ 53,  94,  18,  91],
        [111,  32,  17,  28],
        [ 56, 111,   9,  64],
        [ 81, 111,  39,  23],
        [ 81,  30,  51,  53]], device='cuda:0')
cuda


In [None]:
# Step 1: embedding lookup. Modules follow the notebook-wide `device` instead of a
# hard-coded 'cuda', so the walkthrough also works on CPU-only machines.
CBOW_embedding = nn.Embedding(VOCAB_SIZE, EMBEDD_DIM).to(device)

example_result = CBOW_embedding(example_tensor)
# Now we have a representation of the words in a vector of EMBEDD_DIM Dimensions
print(example_result.shape)
# Concatenate the embeddings of each example into one row
# (torch.flatten(example_result, start_dim=1) would do the same).
example_result = example_result.view(BATCH_SIZE, -1)
print(example_result.shape)


# Step 2: hidden linear layer followed by a ReLU.
CBOW_hidden = nn.Linear(EMBEDD_DIM * FULL_CONTEXT_SIZE, HIDDEN_SIZE).to(device)

CBOW_hidden_relu = nn.ReLU()
example_result = CBOW_hidden(example_result)
example_result = CBOW_hidden_relu(example_result)
print(example_result.shape)
print(device)

torch.Size([6, 4, 10])
torch.Size([6, 40])
torch.Size([6, 256])
cuda


In [None]:
# Step 3: project the hidden activations onto the vocabulary and take a
# log-softmax over the last dimension. The layer follows the notebook-wide
# `device` instead of a hard-coded 'cuda', so it also runs without a GPU.
CBOW_output = nn.Linear(HIDDEN_SIZE, VOCAB_SIZE).to(device)

CBOW_output_soft = nn.LogSoftmax(dim = -1)
example_result = CBOW_output(example_result)
example_result = CBOW_output_soft(example_result)
print(example_result.shape)
print(device)


torch.Size([6, 112])
cuda


In [None]:
# Inspect the first two examples of the batch: the id with the highest
# log-probability, followed by the full row of log-probabilities.
for row_ix in range(2):
    row = example_result[row_ix]
    print(row.argmax(-1))
    print(row)
print(device)

tensor(105, device='cuda:0')
tensor([-5.0558, -5.2768, -4.7627, -4.7145, -4.5963, -4.6088, -4.5524, -4.3121,
        -4.5132, -4.7526, -4.8646, -4.9905, -4.8123, -4.6209, -4.6675, -4.5826,
        -4.9897, -4.7059, -4.7516, -4.8466, -4.6100, -4.7823, -4.3319, -4.2959,
        -4.8649, -4.9223, -4.8116, -4.3349, -4.3782, -4.5373, -4.7756, -4.6252,
        -4.5847, -5.1595, -4.8907, -4.5817, -5.0192, -4.6256, -5.1471, -4.6095,
        -4.2658, -4.6264, -5.4936, -4.6324, -4.6546, -4.7912, -5.1694, -4.8234,
        -4.5502, -5.0332, -4.9396, -4.9764, -4.8820, -5.0477, -4.2381, -4.4675,
        -5.0875, -4.5827, -4.6923, -4.8908, -4.4941, -4.7939, -5.0441, -4.7967,
        -4.8061, -4.5325, -5.2259, -4.8873, -4.8066, -4.7174, -4.7025, -4.5007,
        -4.4148, -4.4291, -4.8298, -4.8457, -5.2504, -4.6443, -4.7679, -5.2180,
        -5.4880, -4.6324, -4.7344, -4.5548, -4.6799, -4.5521, -4.7645, -5.1476,
        -4.4434, -4.7676, -4.5460, -4.5790, -4.8169, -5.0828, -4.7807, -4.6402,
        -4.

In [None]:
# Helper that turns a list of words into the integer tensor the models expect.

def make_context_vector(context, word_to_ix):
    """Look up every word of ``context`` in ``word_to_ix``.

    Returns a 1-D ``torch.long`` tensor holding the ids in the same order as
    the words. A word missing from the mapping raises ``KeyError``.
    """
    token_ids = list(map(word_to_ix.__getitem__, context))
    return torch.tensor(token_ids, dtype=torch.long)

# Show the helper on the first training context. The call is not the last
# expression of the cell, so its value has to be printed to be displayed.
print(make_context_vector(data[0][0], word_to_ix))
print(device)

cuda


In [None]:
def train(model, epochs, data, optimizer, loss_fn):
    """Train ``model`` with one optimizer step per ``(context, target)`` pair.

    Args:
        model: module called with a 1-D tensor of context word ids; its output
            is passed to ``loss_fn`` together with a 1-element target tensor.
        epochs: number of full passes over ``data``.
        data: iterable of ``(context_words, target_word)`` pairs. Words are
            converted to ids with ``make_context_vector`` and the
            notebook-level ``word_to_ix``.
        optimizer: optimizer whose ``step()`` updates the model parameters.
        loss_fn: callable ``loss_fn(log_probs, target_id)`` returning a scalar
            tensor.

    Returns:
        A list with one entry per epoch: the sum of the per-sample losses.
    """
    model.train()

    # Put the inputs wherever the model's parameters live, so a model that was
    # moved to a different device than the notebook-level default still trains.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # The id tensors do not change between epochs: build them once, not on
    # every pass over the data.
    samples = [
        (
            make_context_vector(context, word_to_ix).to(model_device),
            make_context_vector([target], word_to_ix).to(model_device),
        )
        for context, target in data
    ]

    losses = []
    for _ in range(epochs):
        total_loss = 0
        for context_idxs, target_id in samples:
            # Gradients accumulate by default; clear them before each sample.
            model.zero_grad()

            # Forward pass, then loss against the target word id.
            log_probs = model(context_idxs)
            loss = loss_fn(log_probs, target_id)

            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()

            # Get the Python number from a 1-element Tensor by calling tensor.item()
            total_loss += loss.item()
        losses.append(total_loss)
    return losses


In [None]:
# Hyper-parameters for the CBOW training run.
VOCAB_SIZE = len(vocab)
FULL_CONTEXT_SIZE = CONTEXT_SIZE * 2
EMBEDD_DIM, BATCH_SIZE, HIDDEN_SIZE = 10, 6, 256

# Build the model on the selected device, then the optimizer over its parameters.
model = CBOW(VOCAB_SIZE, EMBEDD_DIM, FULL_CONTEXT_SIZE, HIDDEN_SIZE).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)
loss_function = nn.NLLLoss()  # the model already outputs log-softmax values

lossesCBOW = train(
    model=model,
    epochs=100,
    data=data,
    optimizer=optimizer,
    loss_fn=loss_function,
)
model.eval()

print(lossesCBOW)  # The loss decreased every iteration over the training data!

[763.6366374492645, 754.586905002594, 745.7141070365906, 737.0117332935333, 728.4742207527161, 720.0976560115814, 711.8756289482117, 703.8017141819, 695.8725395202637, 688.0854001045227, 680.4357118606567, 672.9172267913818, 665.524908542633, 658.2507607936859, 651.0840373039246, 644.0180397033691, 637.041854262352, 630.1462798118591, 623.3186938762665, 616.5522645711899, 609.8371688127518, 603.1610062122345, 596.517971098423, 589.9010387659073, 583.304720044136, 576.7250691652298, 570.156611084938, 563.5910363197327, 557.0245035290718, 550.455904006958, 543.8841069936752, 537.3061303496361, 530.72320535779, 524.136483579874, 517.5395446717739, 510.9313420057297, 504.31455248594284, 497.6849080324173, 491.0447569489479, 484.3965540230274, 477.73818722367287, 471.0729140341282, 464.39693762362003, 457.7120953500271, 451.0183807015419, 444.31834268569946, 437.6125331521034, 430.9050825536251, 424.1885405629873, 417.4719637185335, 410.75493824481964, 404.0381329357624, 397.3229997307062, 

In [None]:
# Parallel lists of the vocabulary words and their ids, in word_to_ix order
# (used to map a predicted id back to its word).
key_list = [*word_to_ix]
val_list = [*word_to_ix.values()]
def similarity_cbow(word_1, word_2, embedding_model=None):
    """Print how close the learned embeddings of two words are.

    Prints both words, the norm of each embedding (the first one computed by
    hand as well), their dot product, their Euclidean distance and their
    cosine similarity.

    Args:
        word_1: first word; must be a key of the notebook-level ``word_to_ix``.
        word_2: second word; must be a key of the notebook-level ``word_to_ix``.
        embedding_model: object exposing ``get_word_vector(id_tensor)``.
            Defaults to the notebook-level CBOW ``model``.
    """
    if embedding_model is None:
        embedding_model = model

    # test word similarity
    print(word_1)
    print(word_2)
    w1_id = torch.tensor(word_to_ix[word_1], dtype=torch.long).to(device)
    w2_id = torch.tensor(word_to_ix[word_2], dtype=torch.long).to(device)

    # Pure inspection: no autograd graph is needed, and the printed tensors
    # then carry no grad_fn.
    with torch.no_grad():
        word_1_vec = embedding_model.get_word_vector(w1_id)
        word_2_vec = embedding_model.get_word_vector(w2_id)

    # Computed once and reused for both the printout and the cosine similarity.
    norm_1 = torch.linalg.norm(word_1_vec)
    norm_2 = torch.linalg.norm(word_2_vec)
    dot_product = word_1_vec.dot(word_2_vec)

    # The norm of a vector (1D-matrix) is the square root of the sum of all the squared values within the vector.
    print(math.sqrt(torch.square(word_1_vec).sum()))
    print(norm_1)
    print(norm_2)
    print(dot_product)

    word_distance = torch.linalg.norm(word_1_vec - word_2_vec)
    print("Distance between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_distance))
    # Cosine similarity: dot product divided by the product of the norms.
    word_similarity = dot_product / (norm_1 * norm_2)
    print("Similarity between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_similarity))
similarity_cbow("full-text", "search")

full-text
search
4.541694127737662
tensor(4.5417, device='cuda:0', grad_fn=<CopyBackwards>)
tensor(3.2998, device='cuda:0', grad_fn=<CopyBackwards>)
tensor(-2.3952, device='cuda:0', grad_fn=<DotBackward0>)
Distance between 'full-text' & 'search' : 6.0254
Similarity between 'full-text' & 'search' : -0.1598


In [None]:
def predict_middle_word(prev_words, post_words, predictor=None):
    """Print the word predicted to sit between two pieces of context.

    Args:
        prev_words: whitespace-separated words to the left of the gap.
        post_words: whitespace-separated words to the right of the gap.
        predictor: model called with the 1-D tensor of context ids and
            returning scores of shape ``(1, vocab_size)``. Defaults to the
            notebook-level CBOW ``model``.

    Every word must be a key of the notebook-level ``word_to_ix``; an unknown
    word raises ``KeyError``.
    """
    if predictor is None:
        predictor = model

    context_words = prev_words.split() + post_words.split()
    input_words = make_context_vector(context_words, word_to_ix).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = predictor(input_words)
    out_ind = output.argmax(1)

    # Invert the vocabulary here so the lookup depends only on word_to_ix,
    # not on helper lists defined in another cell.
    ix_to_word = {ix: word for word, ix in word_to_ix.items()}
    out_word = ix_to_word[out_ind.item()]
    print(out_word)

In [None]:
# Try the trained CBOW model on two windows taken from the corpus.
for left_context, right_context in (
    ("full-text search", "to techniques"),
    ("Full-text search", "distinguished from "),
):
    predict_middle_word(left_context, right_context)

refers
is


Now that you have seen how to build the CBOW model (word2vec), you should work on the "opposite" model: Skip-Gram.

Skip-gram, as you saw in the lectures, reverses the problem: through the "fake task" you predict the context words from the input (centre) word.


In [None]:
class Skipgram(nn.Module):
    """Embedding model trained through a word-prediction "fake task".

    The input word ids are embedded, the embeddings of one example are
    concatenated, and two linear layers (ReLU in between) followed by a
    log-softmax produce log-probabilities over the vocabulary.

    NOTE(review): as written, the input/output contract matches CBOW
    (``contextSizeIn`` word ids in, one distribution over the vocabulary out).
    A textbook skip-gram would take the centre word and predict its context
    words; the interface is kept as-is because the training cell feeds this
    class the same (context, target) pairs as CBOW. Confirm the intent.

    Args:
        vocab_size: number of rows of the embedding table and size of the
            output distribution.
        embedding_dim: dimensionality of each word embedding.
        contextSizeIn: number of word ids fed per example.
        hidden_size: width of the hidden linear layer (default 256, the value
            that used to be hard-coded).
    """
    def __init__(self, vocab_size, embedding_dim, contextSizeIn, hidden_size=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linearFirst = nn.Linear(contextSizeIn*embedding_dim, hidden_size)
        self.linearSecond = nn.Linear(hidden_size, vocab_size)
        self.contextSize = contextSizeIn

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        ``inputs`` is an integer tensor of word ids, either one example of
        shape ``(contextSizeIn,)`` (output ``(1, vocab_size)``) or a batch of
        shape ``(batch, contextSizeIn)`` (output ``(batch, vocab_size)``).
        """
        # A lone example is treated as a batch of one, which keeps the
        # original (1, vocab_size) output for 1-D input.
        if inputs.dim() < 2:
            inputs = inputs.reshape(1, -1)
        # Concatenate per example; flattening everything into one row would
        # merge the whole batch.
        embeddings = self.embedding(inputs).flatten(start_dim=1)
        linFirstOut = F.relu(self.linearFirst(embeddings))
        linSecondOut = self.linearSecond(linFirstOut)
        logProbabilities = F.log_softmax(linSecondOut, dim=1)
        return logProbabilities

    def get_word_vector(self, word):
        """Return the learned embedding for the word id tensor ``word``."""
        return self.embedding(word)
        

In [None]:

losses = []  # not read anywhere in the visible notebook; kept so the cell defines the same names
lossFunction = nn.NLLLoss()  # the model already outputs log-softmax values
# Follow the notebook-wide `device` instead of a hard-coded 'cuda', so training
# also runs on machines without a GPU.
modelSKIPGRAM = Skipgram(vocab_size, EMBEDD_DIM, FULL_CONTEXT_SIZE).to(device)
optimizer = optim.SGD(modelSKIPGRAM.parameters(), lr=0.001)

lossesSKIPGRAM = train(modelSKIPGRAM, 100, data, optimizer, lossFunction)

modelSKIPGRAM.eval()
print(lossesSKIPGRAM)


[768.51584815979, 760.3814251422882, 752.4013042449951, 744.5687782764435, 736.8795006275177, 729.331107378006, 721.9164290428162, 714.6358025074005, 707.4851417541504, 700.4603791236877, 693.5564687252045, 686.7672955989838, 680.0870931148529, 673.5092389583588, 667.0286347866058, 660.637082695961, 654.3286379575729, 648.0972534418106, 641.9334415197372, 635.8313617706299, 629.7829107046127, 623.7842313051224, 617.8294861316681, 611.9129755496979, 606.0280883312225, 600.1731289625168, 594.3452273607254, 588.5361127853394, 582.743234038353, 576.9627338647842, 571.1899589300156, 565.4227484464645, 559.6595476269722, 553.9010642170906, 548.1417244076729, 542.3838433623314, 536.6249312758446, 530.8630304336548, 525.0946083068848, 519.3204051852226, 513.5403293967247, 507.7536745071411, 501.9599792957306, 496.1563643813133, 490.3432182073593, 484.51973205804825, 478.6843473315239, 472.8398398756981, 466.9810518026352, 461.11416456103325, 455.23841857910156, 449.3543047308922, 443.458541631

In [None]:
def predict_middle_word_SKIPGRAM(prev_words, post_words, predictor=None):
    """Print the word predicted to sit between two pieces of context.

    Args:
        prev_words: whitespace-separated words to the left of the gap.
        post_words: whitespace-separated words to the right of the gap.
        predictor: model called with the 1-D tensor of context ids and
            returning scores of shape ``(1, vocab_size)``. Defaults to the
            notebook-level ``modelSKIPGRAM``.

    Every word must be a key of the notebook-level ``word_to_ix``; an unknown
    word raises ``KeyError``.
    """
    if predictor is None:
        predictor = modelSKIPGRAM

    context_words = prev_words.split() + post_words.split()
    input_words = make_context_vector(context_words, word_to_ix).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = predictor(input_words)
    out_ind = output.argmax(1)

    # Invert the vocabulary here so the lookup depends only on word_to_ix,
    # not on helper lists defined in another cell.
    ix_to_word = {ix: word for word, ix in word_to_ix.items()}
    out_word = ix_to_word[out_ind.item()]
    print(out_word)
predict_middle_word_SKIPGRAM("full-text search", "to techniques")
predict_middle_word_SKIPGRAM("Full-text search", "distinguished from")

refers
is


In [None]:
def similarity_SKIPGRAM(word_1, word_2, embedding_model=None):
    """Print how close the learned embeddings of two words are.

    Prints both words, the norm of each embedding (the first one computed by
    hand as well), their dot product, their Euclidean distance and their
    cosine similarity.

    Args:
        word_1: first word; must be a key of the notebook-level ``word_to_ix``.
        word_2: second word; must be a key of the notebook-level ``word_to_ix``.
        embedding_model: object exposing ``get_word_vector(id_tensor)``.
            Defaults to the notebook-level ``modelSKIPGRAM``.
    """
    if embedding_model is None:
        embedding_model = modelSKIPGRAM

    # test word similarity
    print(word_1)
    print(word_2)
    w1_id = torch.tensor(word_to_ix[word_1], dtype=torch.long).to(device)
    w2_id = torch.tensor(word_to_ix[word_2], dtype=torch.long).to(device)

    # Pure inspection: no autograd graph is needed, and the printed tensors
    # then carry no grad_fn.
    with torch.no_grad():
        word_1_vec = embedding_model.get_word_vector(w1_id)
        word_2_vec = embedding_model.get_word_vector(w2_id)

    # Computed once and reused for both the printout and the cosine similarity.
    norm_1 = torch.linalg.norm(word_1_vec)
    norm_2 = torch.linalg.norm(word_2_vec)
    dot_product = word_1_vec.dot(word_2_vec)

    # The norm of a vector (1D-matrix) is the square root of the sum of all the squared values within the vector.
    print(math.sqrt(torch.square(word_1_vec).sum()))
    print(norm_1)
    print(norm_2)
    print(dot_product)

    word_distance = torch.linalg.norm(word_1_vec - word_2_vec)
    print("Distance between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_distance))
    # Cosine similarity: dot product divided by the product of the norms.
    word_similarity = dot_product / (norm_1 * norm_2)
    print("Similarity between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_similarity))
similarity_SKIPGRAM("full-text", "search")

full-text
search
2.440068774270296
tensor(2.4401, device='cuda:0', grad_fn=<CopyBackwards>)
tensor(2.5758, device='cuda:0', grad_fn=<CopyBackwards>)
tensor(-0.9484, device='cuda:0', grad_fn=<DotBackward0>)
Distance between 'full-text' & 'search' : 3.8060
Similarity between 'full-text' & 'search' : -0.1509
