In [34]:
# Toy corpus of five short sentences used by the cells below.
corpus = [
    "We always come to Paris",
    "The professor is from Australia",
    "I live in Stanford",
    "He comes from Taiwan",
    "The capital of Turkey is Ankara",
]
corpus

['We always come to Paris',
 'The professor is from Australia',
 'I live in Stanford',
 'He comes from Taiwan',
 'The capital of Turkey is Ankara']

In [35]:
def preprocess(sentence):
    """Lowercase ``sentence`` and split it on whitespace into a list of tokens."""
    lowered = sentence.lower()
    tokens = lowered.split()
    return tokens
# Tokenise every sentence of the corpus. A comprehension builds the list in
# one step and keeps the loop variable out of the notebook namespace.
preprocessed_corpus = [preprocess(sentence) for sentence in corpus]

preprocessed_corpus

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [8]:
# Tokens that count as locations; every other token is labelled 0.
locations = {"australia", "ankara", "paris", "stanford", "taiwan", "turkey"}
# One 0/1 label per token, aligned position-by-position with preprocessed_corpus.
train_labels = [
    [int(word in locations) for word in sentence]
    for sentence in preprocessed_corpus
]
train_labels


[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [13]:
# Collect every distinct token that occurs in the corpus.
vocabulary = {word for sentence in preprocessed_corpus for word in sentence}
# Reserve an unknown-word token for any word that is not in the vocabulary.
vocabulary.add("<unk>")
vocabulary


{'<unk>',
 'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [14]:
# Reserve a padding token in the vocabulary as well.
vocabulary |= {"<pad>"}

In [16]:
def pad_window(sentence, window_size, padtoken="<pad>"):
    """Return a new list with ``window_size`` pad tokens on both sides of ``sentence``.

    Args:
        sentence: List of tokens; it is not modified.
        window_size: Number of ``padtoken`` copies added on each side.
        padtoken: Token used for padding.

    Returns:
        A new list: padding, then the tokens of ``sentence``, then padding.
    """
    padding = [padtoken] * window_size
    left_padded = padding + sentence
    return left_padded + padding

# Pad the first corpus sentence with two pad tokens on each side.
window_size = 2
pad_window(preprocessed_corpus[0], window_size=window_size, padtoken="<pad>")

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [17]:
# Sorting gives every token a fixed position; that position is its index.
ix_to_word = sorted(vocabulary)

# Inverse lookup: token -> index.
word_to_ix = dict(zip(ix_to_word, range(len(ix_to_word))))
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [18]:
# Index -> token lookup; with the sorted vocabulary, position 1 holds "<unk>".
ix_to_word[1]

'<unk>'

In [20]:
def convert_token_to_indices(sentence, word_to_ix):
    """Map every token of ``sentence`` to its index in ``word_to_ix``.

    Tokens that are missing from ``word_to_ix`` are mapped to the index stored
    under ``"<unk>"``.

    Args:
        sentence: Iterable of tokens.
        word_to_ix: Mapping from token to integer index.

    Returns:
        A list with one index per token, in the order of ``sentence``.
    """
    return [
        word_to_ix[token] if token in word_to_ix else word_to_ix["<unk>"]
        for token in sentence
    ]

# "kuwait" is not in the vocabulary, so it is expected to come back as "<unk>".
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(sentence=example_sentence, word_to_ix=word_to_ix)
restored_example = [ix_to_word[position] for position in example_indices]

print(
    f"Original sentence is: {example_sentence}",
    f"Going from words to indices: {example_indices}",
    f"Going from indices to words: {restored_example}",
    sep="\n",
)

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [22]:
# Convert every corpus sentence to indices. Despite the variable name, no pad
# tokens are inserted at this point.
example_padded_indices = [
    convert_token_to_indices(sentence=tokens, word_to_ix=word_to_ix)
    for tokens in preprocessed_corpus
]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [None]:
import torch.nn as nn
import torch

# Embedding table with one row per vocabulary entry; the values start out random.
embedding_dim = 5
embeds = nn.Embedding(num_embeddings=len(vocabulary), embedding_dim=embedding_dim)

# Inspect the parameters of the embedding layer.
list(embeds.parameters())

[Parameter containing:
 tensor([[-1.7934, -0.3921,  0.3263,  0.0237,  0.3535],
         [-0.4844,  0.1531,  0.7052,  0.6178,  0.3414],
         [ 0.4837, -0.6306, -0.9472,  0.4646, -0.6702],
         [ 1.4959, -0.4575,  0.3866,  1.8097, -0.0988],
         [ 1.0381,  0.4787, -0.2977,  0.7271,  0.9825],
         [-0.2925,  0.4193, -0.4721,  0.0244, -0.7662],
         [ 1.4292,  0.7997,  0.0171, -1.4060,  0.2648],
         [-0.8661,  0.4205,  0.0369,  1.5047,  0.3901],
         [-0.4543,  0.8686,  0.9590,  1.8814,  1.0227],
         [ 0.7355, -0.2077, -0.9366,  0.1022,  0.8561],
         [-0.8309, -1.1065, -0.1666, -0.5171, -0.4862],
         [-1.0524, -0.3441, -0.2002,  0.4536,  0.3124],
         [ 1.1443, -0.1175,  1.5089,  0.0245, -0.8457],
         [-0.5645,  2.1834, -0.4145, -0.5798, -0.4642],
         [ 0.2338,  0.6342,  1.1468, -0.3125,  0.5227],
         [ 1.6301, -0.3974, -1.8638,  1.1229,  0.5235],
         [-0.2502, -0.2389, -0.6230,  0.7744,  0.1322],
         [-0.0521, -0.174

In [32]:

# Fetch the embedding of a single word through its vocabulary index.
index = word_to_ix["taiwan"]
index_tensor = torch.tensor(index, dtype=torch.long)
taiwan_embeds = embeds(index_tensor)
taiwan_embeds

tensor([ 0.0861, -0.3245,  1.4825, -0.8944, -1.5655],
       grad_fn=<EmbeddingBackward0>)

In [33]:
# Several embeddings can be fetched at once by passing a tensor of indices.
index_paris, index_ankara = word_to_ix["paris"], word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
embeddings

tensor([[ 1.6301, -0.3974, -1.8638,  1.1229,  0.5235],
        [ 1.4959, -0.4575,  0.3866,  1.8097, -0.0988]],
       grad_fn=<EmbeddingBackward0>)

**DATALOADER**

In [46]:
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
    """Collate ``(tokens, labels)`` pairs into padded index and label tensors.

    Args:
        batch: Sequence of ``(tokens, labels)`` pairs, where ``tokens`` is a
            list of string tokens and ``labels`` is a list of integer labels.
        window_size: Number of ``"<pad>"`` tokens added to each side of every
            sentence before it is converted to indices.
        word_to_ix: Mapping from token to integer index. It must contain the
            ``"<pad>"`` and ``"<unk>"`` entries.

    Returns:
        A tuple ``(x_padded, y_padded, lengths)`` of ``torch.long`` tensors.
        ``x_padded`` holds the window-padded token indices, right-padded with
        the ``"<pad>"`` index up to the longest sequence in the batch.
        ``y_padded`` holds the labels, right-padded with 0. ``lengths`` holds
        the number of labels of each example before padding.

    Raises:
        KeyError: If ``word_to_ix`` has no ``"<pad>"`` or ``"<unk>"`` entry.
    """
    # Unpack the batch into training examples (x) and labels (y).
    x, y = zip(*batch)

    # Look the special indices up once instead of once per token.
    pad_token_ix = word_to_ix["<pad>"]
    unk_token_ix = word_to_ix["<unk>"]

    # Surround every sentence with the window padding, map its tokens to
    # indices (unknown tokens fall back to "<unk>"), and turn the result into
    # a tensor because pad_sequence expects tensors.
    window = ["<pad>"] * window_size
    x = [
        torch.tensor(
            [word_to_ix.get(token, unk_token_ix) for token in window + sentence + window],
            dtype=torch.long,
        )
        for sentence in x
    ]
    # Pad all input sequences to the length of the longest sequence.
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

    # Record the number of labels per example before padding them, so that the
    # caller knows how many words existed in each example.
    lengths = torch.tensor([len(labels) for labels in y], dtype=torch.long)

    y = [torch.tensor(labels, dtype=torch.long) for labels in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    # The order of the returned values matches the order in which the
    # training loop unpacks them.
    return x_padded, y_padded, lengths




In [47]:
# Pair every tokenised sentence with its label sequence.
data = list(zip(preprocessed_corpus, train_labels))

batch_size = 2
shuffle = True
window_size = 2
# Bind the extra collate arguments so the DataLoader can call the function
# with the batch alone.
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Iterate over the loader once and show the contents of every batch.
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    print(
        f"Iteration {counter}",
        "Batched Input:",
        batched_x,
        "Batched Labels:",
        batched_y,
        "Batched Lengths:",
        batched_lengths,
        "",
        sep="\n",
    )
    counter += 1


Iteration 0
Batched Input:
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0],
        [ 0,  0, 10, 13, 11, 17,  0,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 1],
        [0, 0, 0, 1, 0, 0]])
Batched Lengths:
tensor([6, 4])

Iteration 1
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0],
        [ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([5, 5])

Iteration 2
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1]])
Batched Lengths:
tensor([4])



In [48]:
# batched_x still holds the last batch produced by the loop over `loader`.
print("Original Tensor: ", batched_x, "", sep="\n")

# Slide a window of 2 * window_size + 1 indices along dimension 1, moving one
# position at a time.
chunk = batched_x.unfold(1, 2 * window_size + 1, 1)
print("Windows: ", chunk, sep="\n")

Original Tensor: 
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0]])

Windows: 
tensor([[[ 0,  0,  9,  7,  8],
         [ 0,  9,  7,  8, 18],
         [ 9,  7,  8, 18,  0],
         [ 7,  8, 18,  0,  0]]])
