**<h1>LAB1 Level A: Word Representation</h1>**

In [1]:
# Mount Google Drive at /content/drive; the dataset is read from a path below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import bz2

class SimpleWikiDataset():
    """Re-iterable stream of tokenized sentences read from a bz2-compressed text file.

    Every line of the file is treated as one sentence and split on whitespace.
    The file is reopened on each iteration, so the dataset can be traversed
    any number of times.

    Args:
        max_sentences: Maximum number of sentences to yield. `None` (the
            default) means no limit; `0` yields no sentences at all.
        path: Location of the bz2-compressed text file. Defaults to the
            location of the Simple Wikipedia dump on the mounted Drive.
    """

    # Default location of the corpus on the mounted Google Drive.
    DEFAULT_PATH = '/content/drive/MyDrive/TDDE09/labs/l1/simplewiki.txt.bz2'

    def __init__(self, max_sentences=None, path=DEFAULT_PATH):
        self.max_sentences = max_sentences
        self.path = path

    def __iter__(self):
        with bz2.open(self.path, 'rt') as sentences:
            for i, sentence in enumerate(sentences):
                # Compare against None (not truthiness) so that a limit of 0
                # is honoured instead of being treated as "no limit".
                if self.max_sentences is not None and i >= self.max_sentences:
                    break
                yield sentence.split()

In [12]:
# Minimal dataset: the first 11,638 sentences (about 1% of the N = 1,163,769 sentences).
mini_dataset = SimpleWikiDataset(max_sentences=11638)

# NOTE: `full_dataset` is bound to the minimal dataset, exactly as in the
# original version of this cell, so that the cells below run quickly.
# To use all sentences, replace this line with `full_dataset = SimpleWikiDataset()`.
full_dataset = mini_dataset

In [13]:
def tokens(sentences):
    """Flatten an iterable of sentences into one continuous stream of tokens."""
    for sentence in sentences:
        yield from sentence

In [8]:
# Count the tokens in the minimal dataset by consuming the token stream.
n_tokens = sum(1 for _token in tokens(mini_dataset))
print(n_tokens)

FileNotFoundError: ignored

<h3>Problem 1: Build the vocabulary and frequency table</h3>

In [14]:
import numpy as np

def make_vocab_and_counts(sentences, min_count=5):
    """Build the vocabulary and the matching frequency table.

    Args:
        sentences: Iterable of sentences, each an iterable of string tokens.
            It is consumed exactly once.
        min_count: Tokens that occur fewer than `min_count` times are discarded.

    Returns:
        A pair `(vocab, counts)`. `vocab` is a dict mapping every retained
        token to an integer id in `0 .. len(vocab) - 1`; ids are assigned in
        order of decreasing frequency, ties broken by ascending token order.
        `counts` is a NumPy integer array in which `counts[vocab[token]]` is
        the frequency of `token`.
    """
    from collections import Counter

    # Count token by token instead of concatenating everything into one
    # fixed-width NumPy string array, which is memory-hungry and fails when
    # the input is empty.
    frequencies = Counter()
    for sentence in sentences:
        frequencies.update(sentence)

    # Keep the sufficiently frequent tokens, most frequent first.
    retained = sorted(
        ((token, count) for token, count in frequencies.items() if count >= min_count),
        key=lambda item: (-item[1], item[0]),
    )

    # Number the retained tokens consecutively so that ids index `counts`.
    vocab = {token: index for index, (token, _) in enumerate(retained)}
    counts = np.fromiter((count for _, count in retained), dtype=int, count=len(retained))
    return vocab, counts

Test code: the vocabulary size should be 3231 for the minimal dataset and 73339 for the full dataset.

In [15]:
# Build the vocabulary and report its contents and size.
vocab, counts = make_vocab_and_counts(full_dataset)
print(type(counts))
print(f"Vocabulary:  {vocab}")
print(f"Frequencies: {list(counts)}")
print(f"Unique Vocabulary Length: {len(vocab)}")

<class 'numpy.ndarray'>
Frequencies: [14507, 5014, 4602, 4529, 4000, 3219, 3010, 2958, 2225, 1271, 1271, 1262, 1144, 1141, 1102, 1056, 1008, 935, 876, 851, 816, 727, 709, 668, 610, 562, 539, 520, 519, 518, 517, 508, 482, 459, 453, 445, 441, 421, 415, 409, 406, 404, 397, 392, 389, 387, 384, 382, 379, 375, 352, 347, 332, 327, 323, 306, 305, 303, 303, 301, 300, 298, 297, 290, 286, 279, 278, 277, 276, 276, 275, 275, 267, 267, 265, 263, 263, 262, 259, 253, 253, 253, 251, 251, 250, 246, 244, 243, 242, 239, 238, 237, 236, 234, 232, 231, 225, 224, 223, 221, 220, 220, 220, 218, 217, 217, 216, 216, 216, 210, 209, 208, 205, 203, 197, 197, 196, 192, 192, 190, 188, 187, 186, 186, 183, 183, 182, 181, 178, 176, 175, 172, 170, 169, 168, 168, 168, 166, 166, 165, 163, 163, 163, 162, 158, 157, 157, 156, 155, 155, 154, 154, 152, 151, 150, 150, 148, 147, 146, 143, 143, 142, 142, 140, 140, 139, 138, 137, 136, 135, 135, 133, 130, 130, 130, 130, 129, 129, 128, 127, 127, 126, 126, 125, 124, 124, 123, 121, 120,

# <h3>Problem 2: Preprocess the data</h3>

In [16]:
import math as m

def subsampling(t, N, w):
    """Decide at random whether a single occurrence of a word is kept.

    The occurrence is discarded with probability `max(0, 1 - sqrt(t * N / w))`.

    Args:
        t: Subsampling threshold.
        N: Total number of tokens.
        w: Number of occurrences of the word; must be greater than zero.

    Returns:
        `True` if the occurrence is kept, `False` if it is discarded.
    """
    discard_probability = max(0, 1 - m.sqrt((t * N) / w))
    # Draw a scalar (not a 1-element array) and always return a real bool,
    # instead of falling through to an implicit None for discarded words.
    return bool(discard_probability <= np.random.rand())

def preprocess(vocab, counts, sentences, threshold=0.001):
    """Convert sentences into lists of word ids, dropping unknown and subsampled words.

    Args:
        vocab: Dict mapping words to integer ids.
        counts: Array indexed by word id that holds the word frequencies.
        sentences: Iterable of sentences, each an iterable of words.
        threshold: Subsampling threshold handed to `subsampling`.

    Yields:
        For every sentence that still contains at least one word after
        filtering, the list of ids of the words that were kept.
    """
    # Total number of tokens covered by the frequency table.
    n_tokens = np.sum(counts)

    for sentence in sentences:
        # Keep a word only if it is in the vocabulary and survives subsampling.
        kept_ids = [
            vocab[word]
            for word in sentence
            if word in vocab and subsampling(threshold, n_tokens, counts[vocab[word]])
        ]
        # Sentences that end up empty are skipped entirely.
        if kept_ids:
            yield kept_ids

Test code: after preprocessing, about 59% of the tokens should remain for the minimal dataset and about 69% for the full dataset.

In [17]:
# Compare the number of tokens before and after preprocessing.
foo = preprocess(vocab, counts, full_dataset)

original_length = sum(len(sentence) for sentence in full_dataset)
foo_length = sum(len(sentence) for sentence in foo)
print(f"Original Length: {original_length}")
print(f"Filtered Length: {foo_length}")
print(f"Percentage: {round((foo_length/original_length)*100, 2)}%")

Original Length: 170815
Filtered Length: 101487
Percentage: 59.41%


# <h3>Problem 3: Generate the training examples</h3>

In [18]:
# Rebuild the vocabulary and frequency table from `full_dataset` for the cells below.
vocab, counts = make_vocab_and_counts(full_dataset)

In [19]:
import torch

def training_examples(vocab, counts, sentences, window=5, num_ns=5, batch_size=1<<19, ns_exponent=0.75):
    """Generate mini-batches of skip-gram training examples with negative samples.

    Sentences are first passed through `preprocess`. For each preprocessed
    sentence a window size is drawn uniformly from `1 .. window`, and every
    (target, context) pair within that symmetric window becomes a positive
    example. Context words equal to the target word are skipped.

    Args:
        vocab: Dict mapping words to integer ids.
        counts: Array indexed by word id that holds the word frequencies.
        sentences: Iterable of sentences, each an iterable of words.
        window: Maximum number of context positions on each side of the target.
        num_ns: Number of negative samples per positive example.
        batch_size: Maximum number of examples per batch.
        ns_exponent: Exponent applied to the counts to form the
            negative-sampling distribution.

    Yields:
        Pairs `(targets, contexts)` of integer tensors. `targets` has shape
        `(B,)`; `contexts` has shape `(B, 1 + num_ns)` with the positive
        context id in column 0 and the negative samples in the other columns.
        `B` equals `batch_size` except for the last batch, which may be
        smaller. Empty batches are never yielded.
    """
    # Negative-sampling distribution: counts raised to `ns_exponent`. Samples
    # are drawn by inverting the cumulative sum with a binary search.
    weights = torch.tensor(np.asarray(counts, dtype=np.float64) ** ns_exponent,
                           dtype=torch.float32)
    cumulative = torch.cumsum(weights, dim=0)

    def sample_negatives(n_rows):
        """Draw an (n_rows, num_ns) tensor of word ids from the noise distribution."""
        uniform = torch.rand(n_rows, num_ns) * cumulative[-1]
        # Clamp guards against float rounding pushing an index past the last id.
        return torch.searchsorted(cumulative, uniform).clamp_(max=len(cumulative) - 1)

    def make_batch(target_ids, context_ids):
        """Assemble one batch; negatives are sampled for exactly this many rows."""
        positives = torch.tensor(context_ids).unsqueeze(1)
        negatives = sample_negatives(len(target_ids))
        return torch.tensor(target_ids), torch.cat((positives, negatives), 1)

    targets, contexts = [], []
    # Stream the preprocessed sentences instead of materializing them all.
    for sentence_ids in preprocess(vocab, counts, sentences):
        # Uniform over 1..window inclusive (ceil(uniform(1, window)) practically
        # never produced a window of size 1).
        half_width = np.random.randint(1, max(window, 1) + 1)
        for position, target_id in enumerate(sentence_ids):
            start = max(0, position - half_width)
            # +1 because the slice end is exclusive; without it the right
            # context is one token shorter than the left context.
            stop = min(len(sentence_ids), position + half_width + 1)
            for context_id in sentence_ids[start:stop]:
                # Skips the target position itself, and also any other
                # occurrence of the same word inside the window.
                if context_id == target_id:
                    continue
                targets.append(target_id)
                contexts.append(context_id)
                if len(targets) >= batch_size:
                    yield make_batch(targets, contexts)
                    targets, contexts = [], []

    # Emit the remaining examples, but never an empty batch.
    if targets:
        yield make_batch(targets, contexts)

Test code: the ratio of training examples to original tokens should be about 2.64 for the minimal dataset and about 3.25 for the full dataset.

In [20]:
# Count the generated training examples and batches; `w` and `c` keep the last batch.
foo, batches = 0, 0
w, c = [], []
for x, y in training_examples(vocab, counts, list(full_dataset)):
    w, c = x, y
    foo += len(x)
    batches += 1

print(f"Batch Length: {foo}")
print(f"Ratio: {foo/original_length}")
print(f"Batches: {batches}")

Batch Length: 454125
Ratio: 2.6585779937359133
Batches: 1


# <h3>Problem 4: Implement the model</h3>

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SGNSModel(nn.Module):
    """Skip-gram model with separate embedding tables for target and context words."""

    def __init__(self, vocab, embedding_dim):
        super().__init__()
        self.vocab = vocab
        n_words = len(vocab)
        # `w` embeds target words, `c` embeds context words.
        self.w = nn.Embedding(n_words, embedding_dim)
        self.c = nn.Embedding(n_words, embedding_dim)

    def forward(self, w, c):
        """Compute the dot product of each target word with each of its context words.

        Args:
            w: Tensor of target word ids with shape (B,).
            c: Tensor of context word ids with shape (B, K).

        Returns:
            Tensor of scores with shape (B, 1, K).
        """
        target_vectors = self.w(w).unsqueeze(1)       # (B, 1, D)
        context_vectors = self.c(c).transpose(1, 2)   # (B, D, K)
        return torch.bmm(target_vectors, context_vectors)

Test code: picks a random batch among the first 100 batches of training examples and prints the model's scores for it.

In [22]:
import numpy as np

def random_example(vocab, counts, sentences):
    """Return one randomly chosen small batch of training examples.

    A random number of batches below 100 is skipped; the batch reached after
    skipping (or the last batch, if there are fewer) is returned. Batches
    hold 5 examples with one negative sample each.
    """
    n_skip = np.random.randint(100)
    batches = training_examples(vocab, counts, sentences, num_ns=1, batch_size=5)
    for index, example in enumerate(batches):
        if index >= n_skip:
            break
    return example

# Score one random batch with a freshly initialised (untrained) model.
w, c = random_example(vocab, counts, full_dataset)
model = SGNSModel(vocab, 50)
# Call the module itself rather than .forward() so that nn.Module's hooks are run.
print(model(w, c))

tensor([[[  3.2633,   3.9679]],

        [[-15.6455,  -2.7786]],

        [[ 17.1294,  12.0510]],

        [[ -6.1056, -20.6147]],

        [[  1.0540,   8.5170]]], grad_fn=<BmmBackward0>)


# <h3>Problem 5: Train the model</h3>

In [None]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import tqdm

def onehot(batch, num_ns=5):
    """Build the target labels for a batch of skip-gram examples.

    Args:
        batch: Number of examples in the batch.
        num_ns: Number of negative samples per example.

    Returns:
        A float64 tensor of shape `(batch, 1 + num_ns)` whose first column
        (the positive example) is 1 and whose remaining columns (the
        negative samples) are 0.
    """
    labels = torch.zeros((batch, 1 + num_ns), dtype=torch.float64)
    labels[:, 0] = 1
    return labels

def train(sentences, embedding_dim=50, window=5, num_ns=5, batch_size=1<<19, n_epochs=1, lr=1e-1):
    """Train a skip-gram model with negative sampling.

    Args:
        sentences: Re-iterable collection of sentences; it is traversed once
            to build the vocabulary and once more in every epoch.
        embedding_dim: Dimensionality of the word embeddings.
        window: Maximum context window size passed to `training_examples`.
        num_ns: Number of negative samples per positive example.
        batch_size: Maximum number of examples per mini-batch.
        n_epochs: Number of passes over the data.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The trained `SGNSModel`.
    """
    # Create the vocabulary and the counts
    vocab, counts = make_vocab_and_counts(sentences)

    # Initialize the model
    model = SGNSModel(vocab, embedding_dim)

    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(model.parameters(), lr=lr)

    with tqdm.tqdm(total=n_epochs) as pbar:
        for t in range(n_epochs):
            pbar.set_description(f'Epoch {t+1}')
            running_loss, n_batches = 0.0, 0

            # Forward the sampling hyperparameters; previously they were
            # accepted by train() but silently ignored.
            examples = training_examples(vocab, counts, sentences, window=window,
                                         num_ns=num_ns, batch_size=batch_size)
            for w, c in examples:
                # Nothing to learn from an empty batch.
                if w.numel() == 0:
                    continue

                # Reset the accumulated gradients
                optimizer.zero_grad()

                # Forward pass: raw scores of shape (B, 1, 1 + num_ns)
                output = model(w, c)

                # Column 0 holds the positive example; the rest are negatives.
                # Deriving the labels from `output` keeps them correct for any num_ns.
                labels = torch.zeros_like(output)
                labels[..., 0] = 1.0

                # The logits variant fuses sigmoid and BCE and stays
                # numerically stable for large-magnitude scores.
                loss = F.binary_cross_entropy_with_logits(output, labels)

                # Backward pass; propagates the loss and computes the gradients
                loss.backward()

                # Update the parameters of the model
                optimizer.step()

                running_loss += loss.item()
                n_batches += 1

            if n_batches:
                pbar.set_postfix(avg_loss=running_loss / n_batches)
            pbar.update()

    return model

Test code: training should take 10–40 minutes per epoch on a CPU.

In [None]:
# Train the model for a single epoch on `full_dataset`.
model = train(full_dataset, n_epochs=1)

Epoch 1: 100%|██████████| 1/1 [18:48<00:00, 1128.36s/it]


[Epoch 1 avg loss]: 0.6208312931385908





# <h3>Problem 6: Analyse the embeddings (reflection)</h3>

In [None]:
def save_model(model, output_dir="/content/drive/MyDrive/_LiU/Master's/Courses/TDDE09 - NLP/Lab1"):
    """Write the target-word embeddings and their word labels to TSV files.

    Two files are written into `output_dir`: `vectors.tsv` holds one
    tab-separated embedding vector per line, and `metadata.tsv` holds the
    corresponding word on the same line number. Rows are ordered by word id.

    Args:
        model: Object with a `vocab` dict (word -> id) and an embedding
            `w` whose `weight` row `i` is the vector of the word with id `i`.
        output_dir: Existing directory that receives the two files.
    """
    import os

    # Extract the embedding vectors as a NumPy array; .cpu() makes this work
    # when the model lives on a GPU as well.
    embeddings = model.w.weight.detach().cpu().numpy()

    # Order the words by id
    items = sorted((i, w) for w, i in model.vocab.items())

    vectors_path = os.path.join(output_dir, 'vectors.tsv')
    metadata_path = os.path.join(output_dir, 'metadata.tsv')

    # Write the embeddings and the word labels to files. UTF-8 is set
    # explicitly so non-ASCII words are written identically on every platform.
    with open(vectors_path, 'wt', encoding='utf-8') as fp1, open(metadata_path, 'wt', encoding='utf-8') as fp2:
        for i, w in items:
            # Look the vector up by id rather than relying on positional zip.
            print('\t'.join('{:.5f}'.format(x) for x in embeddings[i]), file=fp1)
            print(w, file=fp2)

# Write the trained model's word vectors and word labels to disk.
save_model(model)

Take some time to explore the embedding space. In particular, inspect the local neighbourhoods of words that you are curious about, say the 10 closest neighbours. Document your exploration in a short reflection piece (ca. 150 words). Respond to the following prompts:

* Which words did you try? Which results did you get? Did you do anything else than inspecting local neighbourhoods?
* Based on what you know about word embeddings, did you expect your results? How do you explain them?
* What did you learn? How, exactly, did you learn it? Why does this learning matter?



 TODO: Enter your text here

 * I tried words such as "sea", "race" and "meter". Both "race" and "meter" yielded only a couple of neighbours, some of which were the same word in a different form. For example, "meter" appeared as "meters", "kilometer" and "diameter". Interestingly, though, the different forms of "meter" lay quite some distance apart from each other. This occurred for all three words, possibly because a word in its plural form does not really occur in the same contexts as its singular counterpart.

 * I expected the closest neighbours to be words that are contextually similar, not merely words with similar spelling. For example, one would expect to see words like "driver", "road", "street" and "car" as neighbours of "traffic". I also expected semantically related words to be more closely connected than syntactically related words such as "meter" and "meters". I have not had time to train the model on the full dataset, which is likely one reason for these poor results.

 * I have gained a deeper understanding of skip-gram and of how words in context influence each other: how the meaning of the target word can vary depending on the context, and how important it is to filter out stop words by checking word frequencies. Without this filtering (subsampling), words that occur very frequently can have a misleading influence on the context as well as affect the speed of learning. I learned all of this by implementing the methods and models from scratch instead of using the functions that already exist in Python libraries. By building the skip-gram model from the ground up, I learned in more detail how a skip-gram model works. Doing this the long way round matters in order to "actually" learn what each step does and how the steps work in tandem with each other, instead of only learning that a built-in function works because it "works".