In [95]:
import torch # Basic Torch Module
import torch.nn as nn # NeuralNet Layers
import torch.optim as optim # Optimisers
import pprint # Printing a bit prettier
pp = pprint.PrettyPrinter()

In [14]:
# Tensors

# List vs Tensors ('3D Matrices')

ll = [[1,2,3], [4,5,6], [7,8,9]]

print(ll)

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]


In [15]:
ts = torch.tensor([[1,2,3], [4,5,6], [7,8,9]])
print(ts)

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])


In [18]:
ts = torch.tensor([[1.1,2,3], [4,5,6], [7,8,9]], dtype = torch.float32)
print(ts)

tensor([[1.1000, 2.0000, 3.0000],
        [4.0000, 5.0000, 6.0000],
        [7.0000, 8.0000, 9.0000]])


In [19]:
# Alternative Zeros

zeros = torch.zeros(2, 5)

print(zeros)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])


In [21]:
rr = torch.arange(1,10)
print(rr)

tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])


In [22]:
a = torch.tensor([[1, 2], [2, 3], [4, 5]]) 
b = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]) 

print("A is", a)
print("B is", b)
print("The product is", a.matmul(b))
print("The other product is", a @ b) # +, -, *, @

A is tensor([[1, 2],
        [2, 3],
        [4, 5]])
B is tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])
The product is tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])
The other product is tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])


In [28]:
matr_2d = torch.tensor([[1, 2, 3], [4, 5, 6], [4, 5, 6], [4, 5, 6]])
print(matr_2d.shape)
print(matr_2d)

torch.Size([4, 3])
tensor([[1, 2, 3],
        [4, 5, 6],
        [4, 5, 6],
        [4, 5, 6]])


In [24]:
matr_3d = torch.tensor([[[1, 2, 3, 4], [-2, 5, 6, 9]], [[5, 6, 7, 2], [8, 9, 10, 4]], [[-3, 2, 2, 1], [4, 6, 5, 9]]])
print(matr_3d)
print(matr_3d.shape)

tensor([[[ 1,  2,  3,  4],
         [-2,  5,  6,  9]],

        [[ 5,  6,  7,  2],
         [ 8,  9, 10,  4]],

        [[-3,  2,  2,  1],
         [ 4,  6,  5,  9]]])
torch.Size([3, 2, 4])


In [31]:
prod = matr_3d @ matr_2d

print(prod)
print('The shape of the prod is: ', prod.shape)

tensor([[[ 37,  47,  57],
         [ 78,  96, 114]],

        [[ 65,  85, 105],
         [100, 131, 162]],

        [[ 17,  19,  21],
         [ 84, 108, 132]]])
The shape of the prod is:  torch.Size([3, 2, 3])


In [32]:
rr = torch.arange(1, 16)
print("The shape is currently", rr.shape)
print("The contents are currently", rr)
print()
rr = rr.view(5, 3)
print("After reshaping, the shape is currently", rr.shape)
print("The contents are currently", rr)

The shape is currently torch.Size([15])
The contents are currently tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

After reshaping, the shape is currently torch.Size([5, 3])
The contents are currently tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15]])


In [33]:
import numpy as np

# numpy.ndarray --> torch.Tensor:
arr = np.array([[1, 0, 5]])
data = torch.tensor(arr)
print("This is a torch.tensor", data)

# torch.Tensor --> numpy.ndarray:
new_arr = data.numpy()
print("This is a np.ndarray", new_arr)

This is a torch.tensor tensor([[1, 0, 5]])
This is a np.ndarray [[1 0 5]]


In [39]:
arrll = np.array(ll)
arlltotensor = torch.tensor(arrll)
arlltotensor

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [34]:
prod.numpy()

array([[[ 37,  47,  57],
        [ 78,  96, 114]],

       [[ 65,  85, 105],
        [100, 131, 162]],

       [[ 17,  19,  21],
        [ 84, 108, 132]]])

One of the reasons why we use tensors is vectorized operations: operations that be conducted in parallel over a particular dimension of a tensor.

In [40]:
data = torch.arange(1, 36, dtype=torch.float32).reshape(5, 7)
print("Data is:", data)

# We can perform operations like *sum* over each row...
print("Taking the sum over columns:")
print(data.sum(dim=0))

# or over each column.
print("Taking thep sum over rows:")
print(data.sum(dim=1))

# Other operations are available:
print("Taking the stdev over rows:")
print(data.std(dim=1))

Data is: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19., 20., 21.],
        [22., 23., 24., 25., 26., 27., 28.],
        [29., 30., 31., 32., 33., 34., 35.]])
Taking the sum over columns:
tensor([ 75.,  80.,  85.,  90.,  95., 100., 105.])
Taking thep sum over rows:
tensor([ 28.,  77., 126., 175., 224.])
Taking the stdev over rows:
tensor([2.1602, 2.1602, 2.1602, 2.1602, 2.1602])


In [43]:
data = torch.tensor([[1,2.2, 9.6], [4,-7.2, 6.3]])

print(data.mean(dim=1)) # Row Wise Operation
print(data.mean(dim=0)) # Col Wise Operation



tensor([4.2667, 1.0333])
tensor([ 2.5000, -2.5000,  7.9500])


In [67]:
# Auto Grad

# Create an example tensor
# requires_grad parameter tells PyTorch to store gradients
x = torch.tensor([2.], requires_grad=True)

# Print the gradient if it is calculated
# Currently None since x is a scalar (it is 0)
pp.pprint(x.grad)


y = 3*x**2
y.backward()
print(x.grad) # d(y)/d(x) = d(3x^2)/d(x) = 6x = 12

None
tensor([12.])


In [70]:
# test 2
x = torch.tensor([np.sin(3)], requires_grad = True)

pp.pprint(x.grad)
y = 3*x**2
y.backward()
x.grad

None


tensor([0.8467], dtype=torch.float64)

In [72]:
# Lets update the gradient again one gradient step
z = 3*x**2
z.backward()
x.grad

tensor([1.6934], dtype=torch.float64)

### **Linear Layer**
We can use `nn.Linear(H_in, H_out)` to create a a linear layer. This will take a matrix of `(N, *, H_in)` dimensions and output a matrix of `(N, *, H_out)`. The `*` denotes that there could be arbitrary number of dimensions in between. The linear layer performs the operation `Ax+b`, where `A` and `b` are initialized randomly. If we don't want the linear layer to learn the bias parameters, we can initialize our layer with `bias=False`.

In [88]:
# Create the inputs
input = torch.ones(2,3,4) # N* H_in -> N*H_out

# Make a linear layers transforming N,*,H_in dimensinal inputs to N,*,H_out
# dimensional outputs
linear = nn.Linear(4, 2)
linear_output = linear(input)
linear_output

tensor([[[-0.2484, -0.3679],
         [-0.2484, -0.3679],
         [-0.2484, -0.3679]],

        [[-0.2484, -0.3679],
         [-0.2484, -0.3679],
         [-0.2484, -0.3679]]], grad_fn=<AddBackward0>)

In [76]:
list(linear.parameters()) # Ax + b

[Parameter containing:
 tensor([[-0.0845, -0.3740, -0.2001,  0.4769],
         [-0.3340,  0.1874,  0.2491, -0.4552]], requires_grad=True),
 Parameter containing:
 tensor([-0.0722, -0.0878], requires_grad=True)]

In [86]:
maxpool = nn.AdaptiveMaxPool2d(4,2)
maxpool_output = maxpool(input)
maxpool_output[1]

tensor([[[ 0,  1,  2,  3],
         [ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[ 0,  1,  2,  3],
         [ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]]])

In [89]:
sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output

tensor([[[0.4382, 0.4091],
         [0.4382, 0.4091],
         [0.4382, 0.4091]],

        [[0.4382, 0.4091],
         [0.4382, 0.4091],
         [0.4382, 0.4091]]], grad_fn=<SigmoidBackward0>)

In [90]:
output = sigmoid(maxpool_output[1])
output

tensor([[[0.5000, 0.7311, 0.8808, 0.9526],
         [0.5000, 0.7311, 0.8808, 0.9526],
         [0.9820, 0.9933, 0.9975, 0.9991],
         [0.9997, 0.9999, 1.0000, 1.0000]],

        [[0.5000, 0.7311, 0.8808, 0.9526],
         [0.5000, 0.7311, 0.8808, 0.9526],
         [0.9820, 0.9933, 0.9975, 0.9991],
         [0.9997, 0.9999, 1.0000, 1.0000]]])

In [91]:
class MultilayerPerceptron(nn.Module):


    def __init__(self, input_size, hidden_size):
        # Call to the __init__ function of the super class
        super(MultilayerPerceptron, self).__init__()

        # Bookkeeping: Saving the initialization parameters
        self.input_size = input_size 
        self.hidden_size = hidden_size 

        # Defining of our model
        # There isn't anything specific about the naming of `self.model`. It could
        # be something arbitrary.
        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.input_size),
            nn.Sigmoid()
        )


        def forward(self, x):
            output = self.model(x)
            return output

In [92]:
# Alternative Way to define it 

class MultilayerPerceptron(nn.Module):

    def __init__(self, input_size, hidden_size):
        # Call to the __init__ function of the super class
        super(MultilayerPerceptron, self).__init__()

        # Bookkeeping: Saving the initialization parameters
        self.input_size = input_size 
        self.hidden_size = hidden_size 

        # Defining of our layers
        self.linear = nn.Linear(self.input_size, self.hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(self.hidden_size, self.input_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        linear = self.linear(x)
        relu = self.relu(linear)
        linear2 = self.linear2(relu)
        output = self.sigmoid(linear2)
        return output

In [93]:
# Make a sample input
input = torch.randn(2, 5)

# Create our model
model = MultilayerPerceptron(5, 3)

# Pass our input through our model
model(input)

tensor([[0.5959, 0.4947, 0.3749, 0.5005, 0.6783],
        [0.5021, 0.4759, 0.4328, 0.5804, 0.5663]], grad_fn=<SigmoidBackward0>)

In [94]:
list(model.named_parameters())

[('linear.weight',
  Parameter containing:
  tensor([[-0.4161,  0.4228,  0.2159, -0.1273,  0.3866],
          [ 0.3517, -0.1959, -0.0090,  0.2656,  0.0216],
          [ 0.2552,  0.0704,  0.2814, -0.2291,  0.1701]], requires_grad=True)),
 ('linear.bias',
  Parameter containing:
  tensor([-0.0456, -0.0508, -0.1223], requires_grad=True)),
 ('linear2.weight',
  Parameter containing:
  tensor([[ 0.4478, -0.5310, -0.5390],
          [ 0.0890,  0.0979, -0.0655],
          [-0.2861, -0.1515,  0.1486],
          [-0.3793, -0.4265,  0.4907],
          [ 0.5768, -0.2459,  0.1907]], requires_grad=True)),
 ('linear2.bias',
  Parameter containing:
  tensor([ 0.0648, -0.0894, -0.2861,  0.2729,  0.2468], requires_grad=True))]

In [96]:
# Create the y data
y = torch.ones(10, 5)

# Add some noise to our goal y to generate our x
# We want out model to predict our original data, albeit the noise
x = y + torch.randn_like(y)
x

tensor([[ 0.8613,  0.4758,  3.0184, -0.0661, -0.3059],
        [ 1.2183,  2.2498,  0.6187,  1.2606,  0.7395],
        [ 2.4300,  1.6481,  1.3270,  1.5043, -0.5223],
        [ 1.3513, -0.5992,  1.5634,  1.3973, -0.2025],
        [ 0.1527,  1.7099,  1.4883, -2.0671,  1.8627],
        [ 1.1948,  0.1298,  0.1568, -0.3245,  1.6241],
        [ 2.3872,  1.3440,  1.0434,  0.1171,  0.9605],
        [ 0.3640,  0.1352,  1.5010,  0.5231,  1.2953],
        [ 4.0890,  0.6901,  0.6015,  2.4574,  0.9185],
        [ 1.2452,  0.4282,  1.7317,  3.3398,  1.0501]])

In [97]:
# Instantiate the model
model = MultilayerPerceptron(5, 3)

# Define the optimizer
adam = optim.Adam(model.parameters(), lr=1e-1)

# Define loss using a predefined loss function
loss_function = nn.BCELoss()

# Calculate how our model is doing now
y_pred = model(x)
loss_function(y_pred, y).item()

0.8096874952316284

In [98]:
# Set the number of epoch, which determines the number of training iterations
n_epoch = 10 

for epoch in range(n_epoch):
  # Set the gradients to 0 such they do not accumulate over time
  adam.zero_grad()

  # Get the model predictions
  y_pred = model(x)

  # Get the loss
  loss = loss_function(y_pred, y)

  # Print stats
  print(f"Epoch {epoch}: traing loss: {loss}")

  # Compute the gradients
  loss.backward()

  # Take a step to optimize the weights
  adam.step()

Epoch 0: traing loss: 0.8096874952316284
Epoch 1: traing loss: 0.6837695240974426
Epoch 2: traing loss: 0.569698691368103
Epoch 3: traing loss: 0.4380474090576172
Epoch 4: traing loss: 0.30530479550361633
Epoch 5: traing loss: 0.1962931603193283
Epoch 6: traing loss: 0.11694113910198212
Epoch 7: traing loss: 0.06505902111530304
Epoch 8: traing loss: 0.03437305614352226
Epoch 9: traing loss: 0.017681915313005447


In [99]:
list(model.parameters())

[Parameter containing:
 tensor([[ 0.9557,  1.0294,  0.9336,  0.1610,  0.7505],
         [-0.5454, -0.1231, -0.1051, -0.1946, -0.7005],
         [-0.0791,  0.2149, -0.3598, -0.2682, -0.2641]], requires_grad=True),
 Parameter containing:
 tensor([ 0.9267, -0.3853, -0.4395], requires_grad=True),
 Parameter containing:
 tensor([[ 1.1494,  1.0764, -0.5574],
         [ 1.4297, -0.0541, -0.5118],
         [ 0.5742,  0.0980, -0.3255],
         [ 1.2964, -0.0293, -0.4493],
         [ 0.7406,  0.9080,  0.4901]], requires_grad=True),
 Parameter containing:
 tensor([0.5166, 0.5561, 1.0287, 0.5255, 0.8529], requires_grad=True)]

In [100]:
# See how our model performs on the training data
y_pred = model(x)
y_pred

tensor([[0.9977, 0.9994, 0.9780, 0.9989, 0.9881],
        [0.9992, 0.9998, 0.9870, 0.9997, 0.9940],
        [0.9994, 0.9999, 0.9889, 0.9998, 0.9951],
        [0.9840, 0.9935, 0.9442, 0.9899, 0.9598],
        [0.9986, 0.9997, 0.9831, 0.9994, 0.9916],
        [0.9896, 0.9962, 0.9547, 0.9938, 0.9694],
        [0.9996, 0.9999, 0.9905, 0.9998, 0.9960],
        [0.9931, 0.9977, 0.9627, 0.9961, 0.9763],
        [0.9998, 1.0000, 0.9943, 0.9999, 0.9979],
        [0.9989, 0.9998, 0.9850, 0.9995, 0.9928]], grad_fn=<SigmoidBackward0>)

In [104]:
y_pred.round() - y 

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], grad_fn=<SubBackward0>)

In [107]:
## NlP with Pytorch

# Our raw data, which consists of sentences
corpus = [
          "We always come to Paris",
          "The professor is from Tuebingen",
          "I live in Berlin",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

#### Preprocessing

To make it easier for our models to learn, we usually apply a few preprocessing steps to our data. This is especially important when dealing with text data. Here are some examples of text preprocessing:
* **Tokenization**: Tokenizing the sentences into words.
* **Lowercasing**: Changing all the letters to be lowercase.
* **Noise removal:** Removing special characters (such as punctuations). 
* **Stop words removal**: Removing commonly used words.

Which preprocessing steps are necessary is determined by the task at hand. For example, although it is useful to remove special characters in some tasks, for others they may be important (for example, if we are dealing with multiple languages). For our task, we will lowercase our words and tokenize. 


In [109]:
def preprocess_sentence(sentence):
    return sentence.lower().split() # lowercase and split the sentence

# Create our training set
train_sentences = [preprocess_sentence(sent) for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'tuebingen'],
 ['i', 'live', 'in', 'berlin'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [111]:
# Set of locations that appear in our corpus
locations = set(["tuebingen", "ankara", "paris", "berlin", "taiwan", "turkey"])

# Our train labels
train_labels = [[1 if word in locations else 0 for word in sent] for sent in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

#### Converting Words to Embeddings

Let's look at our training data a little more closely. Each datapoint we have is a sequence of words. On the other hand, we know that machine learning models work with numbers in vectors. How are we going to turn words into numbers? You may be thinking embeddings and you are right!

Imagine that we have an embedding lookup table `E`, where each row corresponds to an embedding. That is, each word in our vocabulary would have a corresponding embedding row `i` in this table. Whenever we want to find an embedding for a word, we will follow these steps:
1. Find the corresponding index `i` of the word in the embedding table: `word->index`.
2. Index into the embedding table and get the embedding: `index->embedding`.

Let's look at the first step. We should assign all the words in our vocabulary to a corresponding index. We can do it as follows:
1. Find all the unique words in our corpus.
2. Assign an index to each.

In [112]:
# Find all the unique words in our corpus 
vocabulary = set(w for s in train_sentences for w in s)
vocabulary

{'always',
 'ankara',
 'berlin',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'taiwan',
 'the',
 'to',
 'tuebingen',
 'turkey',
 'we'}

`vocabulary` now contains all the words in our corpus. On the other hand, during the test time, we can see words that are not contained in our vocabulary. If we can figure out a way to represent the unknown words, our model can still reason about whether they are a `LOCATION` or not, since we are also looking at the neighboring words for each prediction. 

We introduce a special token, `<unk>`, to tackle the words that are out of vocabulary. We could pick another string for our unknown token if we wanted. The only requirement here is that our token should be unique: we should only be using this token for unknown words. We will also add this special token to our vocabulary. 

In [113]:
vocabulary.add("<unk>") 

Earlier we mentioned that our task was called `Word Window Classification` because our model is looking at the surroundings words in addition to the given word when it needs to make a prediction. 

For example, let's take the sentence "We always come to Paris". The corresponding training label for this sentence is `0, 0, 0, 0, 1` since only Paris, the last word, is a `LOCATION`. In one pass (meaning a call to `forward()`), our model will try to generate the correct label for one word. Let's say our model is trying to generate the correct label `1` for `Paris`. If we only allow our model to see `Paris`, but nothing else, we will miss out on the important information that the word `to` often times appears with `LOCATION`s. 

Word windows allow our model to consider the surrounding `+N` or `-N` words of each word when making a prediction. In our earlier example for `Paris`, if we have a window size of 1, that means our model will look at the words that come immediately before and after `Paris`, which are `to`, and, well, nothing. Now, this raises another issue. `Paris` is at the end of our sentence, so there isn't another word following it. Remember that we define the input dimensions of our `PyTorch` models when we are initializing them. If we set the window size to be `1`, it means that our model will be accepting `3` words in every pass. We cannot have our model expect `2` words from time to time.

The solution is to introduce a special token, such as `<pad>`, that will be added to our sentences to make sure that every word has a valid window around them. Similar to `<unk>` token, we could pick another string for our pad token if we wanted, as long as we make sure it is used for a unique purpose. 

In [114]:
# Add the <pad> token to our vocabulary
vocabulary.add("<pad>")

# Function that pads the given sentence
# We are introducing this function here as an example
# We will be utilizing it later in the tutorial
def pad_window(sentence, window_size, pad_token="<pad>"):
    window = [pad_token] * window_size
    return window + sentence + window

# Show padding example
window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [115]:
# We are just converting our vocabularly to a list to be able to index into it
# Sorting is not necessary, we sort to show an ordered word_to_ind dictionary
# That being said, we will see that having the index for the padding token
# be 0 is convenient as some PyTorch functions use it as a default value
# such as nn.utils.rnn.pad_sequence, which we will cover in a bit
ix_to_word = sorted(list(vocabulary))

# Creating a dictionary to find the index of a given word
word_to_ix = {word: ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'berlin': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'taiwan': 17,
 'the': 18,
 'to': 19,
 'tuebingen': 20,
 'turkey': 21,
 'we': 22}

In [116]:
# Given a sentence of tokens, return the corresponding indices
def convert_token_to_indices(sentence, word_to_ix):

    indices = []
    for token in sentence:
        # Check if the token is in our vocabularly. If it is, get it's index. 
        # If not, get the index for the unknown token.
        if token in word_to_ix:
            index = word_to_ix[token]
        else:
            index = word_to_ix["<unk>"]
        indices.append(index)

    return indices

# More compact version of the same function
def _convert_token_to_indices(sentence, word_to_ix):

    return [word_to_ind.get(token, word_to_ix["<unk>"]) for token in sentence]

# Show an example
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[ind] for ind in example_indices]

print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 19, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [117]:
# Converting our sentences to indices
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 19, 15],
 [18, 16, 12, 8, 20],
 [10, 13, 11, 4],
 [9, 7, 8, 17],
 [18, 5, 14, 21, 12, 3]]

In [118]:
# Creating an embedding table for our words
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)

# Printing the parameters in our embedding table
list(embeds.parameters())

[Parameter containing:
 tensor([[-7.5482e-01, -2.2092e-03, -4.1821e-01,  1.3172e+00,  1.1483e+00],
         [-7.7579e-02, -2.6317e-01, -3.3766e-01,  9.5767e-02, -3.4949e-01],
         [ 1.8953e+00,  1.2690e+00, -1.9266e-01, -1.2372e-01,  1.0209e+00],
         [-3.0698e-01, -1.5109e+00,  2.4257e+00, -3.0385e-01, -1.7477e+00],
         [-2.1599e-01, -4.6947e-02, -2.9402e-01, -1.0722e+00,  3.1287e-01],
         [ 1.8210e-01, -4.3033e-01, -4.6328e-01,  4.1029e-01,  5.6159e-01],
         [ 7.1831e-01, -1.2839e-01,  3.9307e-01, -6.6254e-01,  2.1043e-01],
         [-1.7192e+00,  5.1032e-01,  3.5390e-01,  1.1350e-01, -9.6802e-01],
         [-8.0716e-01,  6.9640e-01, -1.8330e+00, -4.0220e-01,  1.4787e+00],
         [ 2.4446e-01, -1.4288e+00,  1.2488e+00, -1.1721e+00,  9.8981e-01],
         [-5.8435e-01, -7.1598e-01, -1.1451e+00,  3.3518e-01,  4.3613e-01],
         [ 1.4727e+00, -7.4476e-01,  1.5523e+00,  1.8630e+00, -5.4626e-01],
         [ 3.8871e-01, -3.8843e-01,  9.1190e-02, -3.6838e-01, -7.

To get the word embedding for a word in our vocabulary, all we need to do is to create a lookup tensor. The lookup tensor is just a tensor containing the index we want to look up `nn.Embedding` class expects an index tensor that is of type Long Tensor, so we should create our tensor accordingly. 

In [120]:
# Get the embedding for the word Paris
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed

tensor([ 0.5774,  1.5857,  1.2581, -0.1671, -1.3377],
       grad_fn=<EmbeddingBackward0>)

In [121]:
# We can also get multiple embeddings at once
index_paris = word_to_ix["paris"]
index_ankara = word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
embeddings

tensor([[ 0.5774,  1.5857,  1.2581, -0.1671, -1.3377],
        [-0.3070, -1.5109,  2.4257, -0.3038, -1.7477]],
       grad_fn=<EmbeddingBackward0>)

#### Batching Sentences

We have learned about batches in class. Waiting our whole training corpus to be processed before making an update is constly. On the other hand, updating the parameters after every training example causes the loss to be less stable between updates. To combat these issues, we instead update our parameters after training on a batch of data. This allows us to get a better estimate of the gradient of the global loss. In this section, we will learn how to structure our data into batches using the `torch.util.data.DataLoader` class. 

We will be calling the `DataLoader` class as follows: `DataLoader(data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)`.  The `batch_size` parameter determines the number of examples per batch. In every epoch, we will be iterating over all the batches using the `DataLoader`. The order of batches is deterministic by default, but we can ask `DataLoader` to shuffle the batches by setting the `shuffle` parameter to `True`. This way we ensure that we don't encounter a bad batch multiple times.

If provided, `DataLoader` passes the batches it prepares to the `collate_fn`. We can write a custom function to pass to the `collate_fn` parameter in order to print stats about our batch or perform extra processing. In our case, we will use the `collate_fn` to:
1. Window pad our train sentences.
2. Convert the words in the training examples to indices.
3. Pad the training examples so that all the sentences and labels have the same length. Similarly, we also need to pad the labels. This creates an issue because when calculating the loss, we need to know the actual number of words in a given example. We will also keep track of this number in the function we pass to the `collate_fn` parameter.

Because our version of the `collate_fn` function will need to access to our `word_to_ix` dictionary (so that it can turn words into indices), we will make use of the `partial` function in `Python`, which passes the parameters we give to the function we pass it. 

In [124]:
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
    # Break our batch into the training examples (x) and labels (y)
    # We are turning our x and y into tensors because nn.utils.rnn.pad_sequence
    # method expects tensors. This is also useful since our model will be
    # expecting tensor inputs. 
    x, y = zip(*batch)

    # Now we need to window pad our training examples. We have already defined a 
    # function to handle window padding. We are including it here again so that
    # everything is in one place.
    def pad_window(sentence, window_size, pad_token="<pad>"):
        window = [pad_token] * window_size
        return window + sentence + window

    # Pad the train examples.
    x = [pad_window(s, window_size=window_size) for s in x]

    # Now we need to turn words in our training examples to indices. We are
    # copying the function defined earlier for the same reason as above.
    def convert_tokens_to_indices(sentence, word_to_ix):
        return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]

    # Convert the train examples into indices.
    x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

    # We will now pad the examples so that the lengths of all the example in 
    # one batch are the same, making it possible to do matrix operations. 
    # We set the batch_first parameter to True so that the returned matrix has 
    # the batch as the first dimension.
    pad_token_ix = word_to_ix["<pad>"]

    # pad_sequence function expects the input to be a tensor, so we turn x into one
    x = [torch.LongTensor(x_i) for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

    # We will also pad the labels. Before doing so, we will record the number 
    # of labels so that we know how many words existed in each example. 
    lengths = [len(label) for label in y]
    lenghts = torch.LongTensor(lengths)

    y = [torch.LongTensor(y_i) for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    # We are now ready to return our variables. The order we return our variables
    # here will match the order we read them in our training loop.
    return x_padded, y_padded, lenghts 

In [122]:
# Shorter Version of it 

def _custom_collate_fn(batch, window_size, word_to_ix):
    # Prepare the datapoints
    x, y = zip(*batch)  
    x = [pad_window(s, window_size=window_size) for s in x]
    x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

    # Pad x so that all the examples in the batch have the same size
    pad_token_ix = word_to_ix["<pad>"]
    x = [torch.LongTensor(x_i) for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

    # Pad y and record the length
    lengths = [len(label) for label in y]
    lenghts = torch.LongTensor(lengths)
    y = [torch.LongTensor(y_i) for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)
    
    return x_padded, y_padded, lenghts  

In [125]:
# making the dataloader live

# Parameters to be passed to the DataLoader
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Go through one loop
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    print(f"Iteration {counter}")
    print("Batched Input:")
    print(batched_x)
    print("Batched Labels:")
    print(batched_y)
    print("Batched Lengths:")
    print(batched_lengths)
    print("")
    counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0, 18, 16, 12,  8, 20,  0,  0,  0],
        [ 0,  0, 18,  5, 14, 21, 12,  3,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 1]])
Batched Lengths:
tensor([5, 6])

Iteration 1
Batched Input:
tensor([[ 0,  0, 10, 13, 11,  4,  0,  0],
        [ 0,  0,  9,  7,  8, 17,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1],
        [0, 0, 0, 1]])
Batched Lengths:
tensor([4, 4])

Iteration 2
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 19, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1]])
Batched Lengths:
tensor([5])



The batched input tensors you see above will be passed into our model. On the other hand, we started off saying that our model will be a window classifier. The way our input tensors are currently formatted, we have all the words in a sentence in one datapoint. When we pass this input to our model, it needs to create the windows for each word, make a prediction as to whether the center word is a `LOCATION` or not for each window, put the predictions together and return. 

We could avoid this problem if we formatted our data by breaking it into windows beforehand. In this example, we will instead how our model take care of the formatting. 

Given that our `window_size` is `N` we want our model to make a prediction on every `2N+1` tokens. That is, if we have an input with `9` tokens, and a `window_size` of `2`, we want our model to return `5` predictions. This makes sense because before we padded it with `2` tokens on each side, our input also had `5` tokens in it! 

We can create these windows by using for loops, but there is a faster `PyTorch` alternative, which is the `unfold(dimension, size, step)` method. We can create the windows we need using this method as follows:

In [126]:
# Print the original tensor
print(f"Original Tensor: ")
print(batched_x)
print("")

# Create the 2 * 2 + 1 chunks
chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0, 22,  2,  6, 19, 15,  0,  0]])

Windows: 
tensor([[[ 0,  0, 22,  2,  6],
         [ 0, 22,  2,  6, 19],
         [22,  2,  6, 19, 15],
         [ 2,  6, 19, 15,  0],
         [ 6, 19, 15,  0,  0]]])


In [128]:
class WordWindowClassifier(nn.Module):

    def __init__(self, hyperparameters, vocab_size, pad_ix=0):
        super(WordWindowClassifier, self).__init__()

        """ Instance variables """
        self.window_size = hyperparameters["window_size"]
        self.embed_dim = hyperparameters["embed_dim"]
        self.hidden_dim = hyperparameters["hidden_dim"]
        self.freeze_embeddings = hyperparameters["freeze_embeddings"]

        """ Embedding Layer 
        Takes in a tensor containing embedding indices, and returns the 
        corresponding embeddings. The output is of dim 
        (number_of_indices * embedding_dim).

        If freeze_embeddings is True, set the embedding layer parameters to be
        non-trainable. This is useful if we only want the parameters other than the
        embeddings parameters to change. 

        """
        self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False

        """ Hidden Layer
        """
        full_window_size = 2 * window_size + 1
        self.hidden_layer = nn.Sequential(
          nn.Linear(full_window_size * self.embed_dim, self.hidden_dim), 
          nn.Tanh()
        )

        """ Output Layer
        """
        self.output_layer = nn.Linear(self.hidden_dim, 1)

        """ Probabilities 
        """
        self.probabilities = nn.Sigmoid()

        
    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim

        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()

        """
        Reshaping.
        Takes in a (B, L) LongTensor
        Outputs a (B, L~, S) LongTensor
        """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, 2 * self.window_size + 1, 1)
        _, adjusted_length, _ = token_windows.size()

        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S) 
        Outputs a (B, L~, S, D) FloatTensor.
        """
        embedded_windows = self.embeds(token_windows)

        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axes.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into a (B, L~, H) FloatTensor
        """
        layer_1 = self.hidden_layer(embedded_windows)

        """
        Layer 2
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into a (B, L~, 1) FloatTensor.
        """
        output = self.output_layer(layer_1)

        """
        Softmax.
        Takes in a (B, L~, 1) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 1) FloatTensor of (log-)normalized class scores.
        """
        output = self.probabilities(output)
        output = output.view(B, -1)

        return output

In [129]:
# Training 

# Prepare the data
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Initialize a model
# It is useful to put all the model hyperparameters in a dictionary
model_hyperparameters = {
    "batch_size": 4,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

# Define an optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Define a loss function, which computes to binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):   
    # Calculate the loss for the whole batch
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())

    # Rescale the loss. Remember that we have used lengths to store the 
    # number of words in each training example
    loss = loss / batch_lengths.sum().float()

    return loss

In [130]:
# Function that will be called in every epoch
def train_epoch(loss_function, optimizer, model, loader):
  
  # Keep track of the total loss for the batch
    total_loss = 0
    for batch_inputs, batch_labels, batch_lengths in loader:
        # Clear the gradients
        optimizer.zero_grad()
        # Run a forward pass
        outputs = model.forward(batch_inputs)
        # Compute the batch loss
        loss = loss_function(outputs, batch_labels, batch_lengths)
        # Calculate the gradients
        loss.backward()
        # Update the parameteres
        optimizer.step()
        total_loss += loss.item()

    return total_loss

In [131]:
# Function containing our main training loop
def train(loss_function, optimizer, model, loader, num_epochs=10000):

  # Iterate through each epoch and call our train_epoch function
  for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, loader)
    if epoch % 100 == 0: print(epoch_loss)

In [132]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.2894841432571411
0.22141649201512337
0.17786000669002533
0.14061449095606804
0.10255928337574005
0.08533350378274918
0.060118136927485466
0.052894541062414646
0.04444397892802954
0.03734759893268347


In [133]:
# Create test sentences
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0, 0, 1]]

# Create a test loader
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data, 
                                           batch_size=1, 
                                           shuffle=False, 
                                           collate_fn=collate_fn)

In [134]:
for test_instance, labels, _ in test_loader:
    outputs = model.forward(test_instance)
    print(labels)
    print(outputs)

tensor([[0, 0, 0, 1]])
tensor([[0.0413, 0.6142, 0.0452, 0.9549]], grad_fn=<ViewBackward0>)
