# Welcome to CS 5242 **Homework 6**

ASSIGNMENT DEADLINE ⏰ : **23 October 23:59** 

In this assignment, we have four questions. Write the answers to each question in this notebook.

Colab is a hosted Jupyter notebook service that requires no setup to use, while providing access free of charge to computing resources including GPUs. In this semester, we will use Colab to run our experiments.

### **Grades Policy**

This homework is worth 10 points. Late submissions lose 15% per day, and a submission made 7 days after the deadline receives a score of 0.

### **Cautions**

**DO NOT** copy the code from the internet, e.g. GitHub.

---

### **Contact**

Please feel free to contact us if you have any questions about this homework or need any further information.

Slack (Recommended): Kin Whye Chew

TA Email: kinwhye@nus.edu.sg

> If you have not joined the Slack group, you can click [here](https://join.slack.com/t/cs5242ay20222-oiw1784/shared_invite/zt-1eiv24k1t-0J9EI7vz3uQmAHa68qU0aw)

In [None]:
# For Google Colaboratory
import sys, os
# The branch runs only when the google.colab module is already loaded
# (presumably only when the notebook is executed on Colab).
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)
    # NOTE(review): file_name is assigned but not referenced again in this cell.
    file_name = 'Homework_06.ipynb'
    path_to_file = '/content/gdrive/My Drive/Homework06' # Please adjust the path accordingly 
    print(path_to_file)
    # change the working directory to the folder containing "file_name",
    # so that the relative 'dataset/...' paths used later resolve from there
    
    os.chdir(path_to_file)
    # shell escape: print the working directory to confirm the chdir
    !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/Homework
/content/gdrive/My Drive/Homework


In [1]:
!ls

Homework06_Slides.pdf [1m[36mdataset[m[m               utils.py
Homework_06.ipynb     dataset.zip


Extract dataset

In [2]:
!unzip dataset.zip

Archive:  dataset.zip
replace dataset/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


## Question 1 (1 Point)

Implement and train a vanilla recurrent neural network (VRNN) for predicting the next word in a sequence. The dataset is a subset of PTB composed of 20 sub-documents with each training document having 1000 words and each test document having 1000 words as well. 

__Requirements/Grading:__
1. Find the hyperparameters to obtain test perplexity smaller than 400.

Hint: You may choose your own values for the hyper-parameters, except for the number of epochs. You can use torch.nn.RNN. You can use the test perplexity for hyper-parameter tuning. Think about which parameter would have a huge impact on the performance and tune it.

In [3]:
%reset -f
import torch
import datetime
print('Timestamp:',datetime.datetime.now().strftime("%y-%m-%d--%H-%M-%S"))

# Load the (train, test) pair stored in the dataset file and show both shapes.
train_data, test_data = torch.load('dataset/small_PTB.pt')
print(train_data.size(), test_data.size())

# bs is the batch dimension used for the hidden state and for reshaping the scores;
# vocab_size is the size of the embedding table and of the output layer.
bs = 20
vocab_size = 10000

import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import numpy as np

# In order to obtain a test perplexity of less than 400, some hyperparameter tuning has to be performed.
# In this scenario, we're using the test set to perform hyperparameter tuning. In practice, this is NEVER allowed:
# when we do hyperparameter tuning with the test set, the hyperparameters will overfit to the test set, and therefore
# the test set will no longer be an unbiased estimate of the generalization error.

num_epochs = 5
# Here, we only tune the hidden size, lr, and seq length.
# Looking through the code, you can find other hyperparameters to tune, such as the optimizer, number of layers, lr_decay, weight_decay, etc

# YOUR CODE STARTS HERE
# NOTE(review): the run recorded below this cell ends at a test perplexity of
# about 592 with these values, which does not meet the < 400 target stated in
# Question 1 -- these hyperparameters still need tuning.
hidden_size = 50
my_lr = 1
seq_length = 5
# YOUR CODE ENDS HERE

# The code below is taken from the VRNN demo
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


class three_layer_recurrent_net(nn.Module):
    """Language model made of an embedding, an ``nn.RNN`` and a linear head.

    The embedding width and the recurrent state width are both
    ``hidden_size``; the head maps every hidden state to ``vocab_size``
    scores. ``vocab_size`` is read from module scope.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Sub-modules are created in the order embedding -> RNN -> linear so
        # that their random initialisation draws happen in the same order.
        self.layer1 = nn.Embedding(vocab_size, hidden_size)
        self.layer2 = nn.RNN(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, vocab_size)

    def forward(self, word_seq, h_init):
        """Score the next word at every position of ``word_seq``.

        Args:
            word_seq: tensor of word ids fed to the embedding layer.
            h_init: initial hidden state handed to the RNN.

        Returns:
            Tuple ``(score_seq, h_final)``: the per-position scores from the
            linear head and the last hidden state returned by the RNN.
        """
        embedded = self.layer1(word_seq)
        hidden_states, last_hidden = self.layer2(embedded, h_init)
        return self.layer3(hidden_states), last_hidden

# Build the model, show its architecture, then move it to the selected device.
net = three_layer_recurrent_net(hidden_size)
print(net)
net = net.to(device)

# Re-draw the embedding weights, then the output weights, uniformly in [-0.1, 0.1].
nn.init.uniform_(net.layer1.weight, -0.1, 0.1)
nn.init.uniform_(net.layer3.weight, -0.1, 0.1)

print('')

# Loss used by both the training loop and the evaluation function.
criterion = nn.CrossEntropyLoss()
def normalize_gradient(net):
    """Rescale the gradients of ``net`` in place to a global L2 norm of 1.

    The global norm is the square root of the sum of squared norms of all
    parameter gradients. Parameters that have no gradient yet (``grad`` is
    ``None``, e.g. because they did not take part in the last backward pass)
    are skipped instead of raising ``AttributeError``.

    Args:
        net: module whose parameter gradients are normalised.

    Returns:
        float: the global gradient norm measured *before* rescaling.
    """
    grads = [p.grad for p in net.parameters() if p.grad is not None]

    grad_norm_sq = 0
    for g in grads:
        grad_norm_sq += g.data.norm()**2

    grad_norm = math.sqrt(grad_norm_sq)

    if grad_norm < 1e-4:
        # Dividing by a (near-)zero norm would blow the gradients up, so the
        # update is cancelled by clearing the gradients instead.
        net.zero_grad()
        print('grad norm close to zero')
    else:
        for g in grads:
            g.data.div_(grad_norm)

    return grad_norm
def eval_on_test_set():
    """Evaluate the global ``net`` on ``test_data`` and return its perplexity.

    The test rows are visited in consecutive windows of ``seq_length`` rows,
    the hidden state being carried from one window to the next. The
    cross-entropy loss is averaged over the windows; ``exp(mean loss)`` is
    printed and returned.

    Reads the module-level names ``net``, ``criterion``, ``test_data``,
    ``bs``, ``hidden_size``, ``seq_length``, ``vocab_size`` and ``device``.

    Returns:
        float: ``math.exp`` of the mean per-window loss.
    """
    running_loss = 0
    num_batches = 0

    h = torch.zeros(1, bs, hidden_size).to(device)

    # Labels are the inputs shifted by one row, so the last row can only be
    # used as a label (this equals 999 for the 1000-row dataset).
    last_input_row = test_data.size(0) - 1

    # No backward pass happens here: skip building the autograd graph.
    with torch.no_grad():
        for count in range(0, last_input_row - seq_length, seq_length):

            minibatch_data = test_data[count:count + seq_length].to(device)
            minibatch_label = test_data[count + 1:count + seq_length + 1].to(device)

            scores, h = net(minibatch_data, h)

            # flatten (time, batch) into one large batch for the criterion
            minibatch_label = minibatch_label.view(bs * seq_length)
            scores = scores.view(bs * seq_length, vocab_size)

            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    total_loss = running_loss / num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )
    return math.exp(total_loss)

# Wall-clock reference for the cumulative "time=" value printed after each epoch.
start=time.time()

for epoch in range(num_epochs):
    
    # Learning-rate schedule: epochs 0-3 use the initial my_lr; from epoch 4
    # onwards my_lr is divided by 1.1 at the start of the epoch.
    if epoch >= 4:
        my_lr = my_lr / 1.1
    
    # create a new optimizer so that it uses the current learning rate.
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )
        
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
    
    # set the initial h to be the zero vector
    h = torch.zeros(1, bs, hidden_size)

    # move it to the selected device (GPU when available, otherwise CPU)
    h=h.to(device)
    
    # walk through the training rows in consecutive windows of seq_length rows
    for count in range( 0 , 999-seq_length ,  seq_length):
            
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch: the labels are the inputs shifted by one row
        minibatch_data =  train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]        
        
        # move them to the selected device
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        # Detach h so that backpropagation stops at the window boundary
        # instead of reaching back to the beginning of the epoch.
        h=h.detach()
        # h=h.requires_grad_()
                    
        # forward the minibatch through the net; h carries over to the next window
        scores, h  = net( minibatch_data, h )
        
        # reshape the scores and labels to huge batch of size bs*seq_length
        scores          =            scores.view(  bs*seq_length , vocab_size)  
        minibatch_label =   minibatch_label.view(  bs*seq_length )       
        
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(  scores ,  minibatch_label )
        
        # backward pass to compute the gradient of the loss w.r.t. every parameter
        loss.backward()

        # rescale the gradients with normalize_gradient, then do one step of
        # stochastic gradient descent
        normalize_gradient(net)
        optimizer.step()
        
            
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
        
        
    # compute stats for the full training set (mean window loss of this epoch)
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    # eval_on_test_set prints the test perplexity itself; the returned value is kept in test_loss
    test_loss = eval_on_test_set() 


  from .autonotebook import tqdm as notebook_tqdm


Timestamp: 22-10-22--02-00-30
torch.Size([1000, 20]) torch.Size([1000, 20])
cpu
three_layer_recurrent_net(
  (layer1): Embedding(10000, 50)
  (layer2): RNN(50, 50)
  (layer3): Linear(in_features=50, out_features=10000, bias=True)
)


epoch= 0 	 time= 1.9059457778930664 	 lr= 1 	 exp(loss)= 1106.2814238196204
test: exp(loss) =  890.0754735231917

epoch= 1 	 time= 4.782888889312744 	 lr= 1 	 exp(loss)= 753.4084927894423
test: exp(loss) =  778.9946633822738

epoch= 2 	 time= 7.661647796630859 	 lr= 1 	 exp(loss)= 602.2385190769221
test: exp(loss) =  667.070959498059

epoch= 3 	 time= 10.542536973953247 	 lr= 1 	 exp(loss)= 493.3682468674442
test: exp(loss) =  628.3779384551764

epoch= 4 	 time= 13.423060894012451 	 lr= 0.9090909090909091 	 exp(loss)= 405.7438097860771
test: exp(loss) =  592.0378751772201


## Question 2 (2 Points)

Implement and train a vanilla recurrent neural network (VRNN) on the small PTB dataset by **explicitly** implementing the VRNN layer (the use of the function nn.RNN() is prohibited) :

$$
\begin{align}
h_t =& \textrm{ tanh}(Ah_{t-1}+a+Bx_t+b)\\
y_t =& \ C h_{t} +c\\
\end{align}
$$

where $(A,a)$ are the parameters of the linear transformation (matrix,bias) applied to $h_{t-1}$, $(B,b)$ are the parameters of the linear transformation applied to $x_t$ and $(C,c)$ are the parameters of the linear transformation applied to $h_t$.

The  dataset is a subset of PTB composed of 20 sub-documents with each training document having 1000 words and each test document having 1000 words as well. 

__Requirements/Grading:__
1. Explicitly implement a vanilla recurrent neural network
**Hints**: 
1. Activation function tanh is given by *torch.tanh*
1. You may consider creating a list of $h_t$ with `h_seq = []`, add a vector $h_t$ to the list with `h_seq.append(h_t)` and convert the list of vectors into a PyTorch tensor with `h_seq = torch.stack(h_seq, dim=0).squeeze()`.
1. You can reuse the hyperparameters from question 1.

In [4]:
%reset -f
import torch
import datetime
print('Timestamp:',datetime.datetime.now().strftime("%y-%m-%d--%H-%M-%S"))

# Load the (train, test) pair stored in the dataset file and show both shapes.
train_data, test_data = torch.load('dataset/small_PTB.pt')
print(train_data.size(), test_data.size())

# bs is the batch dimension used for the hidden state and for reshaping the scores;
# vocab_size is the size of the embedding table and of the output layer.
bs = 20
vocab_size = 10000

import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import numpy as np

# The RNN layer of three_layer_recurrent_net has to be implemented explicitly (no nn.RNN)

# YOUR CODE STARTS HERE 
class three_layer_recurrent_net(nn.Module):
    """Vanilla RNN language model with the recurrence written out explicitly.

    Implements::

        h_t = tanh(A h_{t-1} + a + B x_t + b)
        y_t = C h_t + c

    where ``x_t`` is the embedding of the t-th input word. ``vocab_size`` is
    read from module scope.
    """

    def __init__(self, hidden_size):
        super(three_layer_recurrent_net, self).__init__()

        self.layer1 = nn.Embedding(vocab_size, hidden_size)

        # A.h_t-1 + a
        self.layer2_1 = nn.Linear(hidden_size, hidden_size)
        # B.x_t + b
        self.layer2_2 = nn.Linear(hidden_size, hidden_size)
        # C.h_t + c
        self.layer3 = nn.Linear(hidden_size, vocab_size)

    def forward(self, word_seq, h_init):
        """Run the recurrence over ``word_seq`` and score every position.

        Args:
            word_seq: tensor of word ids; the first axis is iterated as time.
            h_init: initial hidden state. The training code passes a tensor
                with an extra leading axis of size 1, which broadcasts
                against the per-step embeddings.

        Returns:
            Tuple ``(score_seq, h_final)``: scores with the same leading axes
            as the embedded input and ``vocab_size`` as last axis, and the
            hidden state after the last step (same shape as the per-step
            state, i.e. it keeps any leading axis of ``h_init``).
        """
        g_seq = self.layer1(word_seq)  # e.g. (5, 20, 50) with the default settings

        h_seq = []
        h_t = h_init
        for x_t in g_seq:
            # h_t = tanh((A.h_t-1 + a) + (B.x_t + b))
            h_t = torch.tanh(self.layer2_1(h_t) + self.layer2_2(x_t))
            h_seq.append(h_t)

        h_final = h_t
        # Each h_t may carry the extra size-1 axis of h_init. Reshape to the
        # layout of the embedded input instead of calling squeeze(): a bare
        # squeeze() would also drop the time or batch axis whenever the
        # sequence length or the batch size is 1.
        h_seq = torch.stack(h_seq, dim=0).reshape(g_seq.shape)
        # y_t = C.h_t + c
        score_seq = self.layer3(h_seq)

        return score_seq, h_final

    
# Hyper-parameters: hidden width, initial learning rate, window length.
hidden_size, my_lr, seq_length = 50, 1, 5
# YOUR CODE ENDS HERE

num_epochs = 5

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Build the model, show its architecture, then move it to the selected device.
net = three_layer_recurrent_net(hidden_size)
print(net)
net = net.to(device)

# Re-draw the embedding weights, then the output weights, uniformly in [-0.1, 0.1].
nn.init.uniform_(net.layer1.weight, -0.1, 0.1)
nn.init.uniform_(net.layer3.weight, -0.1, 0.1)

print('')

# Loss used by both the training loop and the evaluation function.
criterion = nn.CrossEntropyLoss()
def normalize_gradient(net):
    """Rescale the gradients of ``net`` in place to a global L2 norm of 1.

    The global norm is the square root of the sum of squared norms of all
    parameter gradients. Parameters that have no gradient yet (``grad`` is
    ``None``, e.g. because they did not take part in the last backward pass)
    are skipped instead of raising ``AttributeError``.

    Args:
        net: module whose parameter gradients are normalised.

    Returns:
        float: the global gradient norm measured *before* rescaling.
    """
    grads = [p.grad for p in net.parameters() if p.grad is not None]

    grad_norm_sq = 0
    for g in grads:
        grad_norm_sq += g.data.norm()**2

    grad_norm = math.sqrt(grad_norm_sq)

    if grad_norm < 1e-4:
        # Dividing by a (near-)zero norm would blow the gradients up, so the
        # update is cancelled by clearing the gradients instead.
        net.zero_grad()
        print('grad norm close to zero')
    else:
        for g in grads:
            g.data.div_(grad_norm)

    return grad_norm
def eval_on_test_set():
    """Evaluate the global ``net`` on ``test_data`` and return its perplexity.

    The test rows are visited in consecutive windows of ``seq_length`` rows,
    the hidden state being carried from one window to the next. The
    cross-entropy loss is averaged over the windows; ``exp(mean loss)`` is
    printed and returned.

    Reads the module-level names ``net``, ``criterion``, ``test_data``,
    ``bs``, ``hidden_size``, ``seq_length``, ``vocab_size`` and ``device``.

    Returns:
        float: ``math.exp`` of the mean per-window loss.
    """
    running_loss = 0
    num_batches = 0

    h = torch.zeros(1, bs, hidden_size).to(device)

    # Labels are the inputs shifted by one row, so the last row can only be
    # used as a label (this equals 999 for the 1000-row dataset).
    last_input_row = test_data.size(0) - 1

    # No backward pass happens here: skip building the autograd graph.
    with torch.no_grad():
        for count in range(0, last_input_row - seq_length, seq_length):

            minibatch_data = test_data[count:count + seq_length].to(device)
            minibatch_label = test_data[count + 1:count + seq_length + 1].to(device)

            scores, h = net(minibatch_data, h)

            # flatten (time, batch) into one large batch for the criterion
            minibatch_label = minibatch_label.view(bs * seq_length)
            scores = scores.view(bs * seq_length, vocab_size)

            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    total_loss = running_loss / num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )
    return math.exp(total_loss)

# Wall-clock reference for the cumulative "time=" value printed after each epoch.
start=time.time()

for epoch in range(num_epochs):
    
    # Learning-rate schedule: epochs 0-3 use the initial my_lr; from epoch 4
    # onwards my_lr is divided by 1.1 at the start of the epoch.
    if epoch >= 4:
        my_lr = my_lr / 1.1
    
    # create a new optimizer so that it uses the current learning rate.
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )
        
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
    
    # set the initial h to be the zero vector
    h = torch.zeros(1, bs, hidden_size)

    # move it to the selected device (GPU when available, otherwise CPU)
    h=h.to(device)
    
    # walk through the training rows in consecutive windows of seq_length rows
    for count in range( 0 , 999-seq_length ,  seq_length):
            
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch: the labels are the inputs shifted by one row
        minibatch_data =  train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]        
        
        # move them to the selected device
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        # Detach h so that backpropagation stops at the window boundary
        # instead of reaching back to the beginning of the epoch.
        h=h.detach()
        # h=h.requires_grad_()
                    
        # forward the minibatch through the net; h carries over to the next window
        scores, h  = net( minibatch_data, h )
        
        # reshape the scores and labels to huge batch of size bs*seq_length
        scores          =            scores.view(  bs*seq_length , vocab_size)  
        minibatch_label =   minibatch_label.view(  bs*seq_length )       
        
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(  scores ,  minibatch_label )
        
        # backward pass to compute the gradient of the loss w.r.t. every parameter
        loss.backward()

        # rescale the gradients with normalize_gradient, then do one step of
        # stochastic gradient descent
        normalize_gradient(net)
        optimizer.step()
        
            
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
        
        
    # compute stats for the full training set (mean window loss of this epoch)
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    # eval_on_test_set prints the test perplexity itself; the returned value is kept in test_loss
    test_loss = eval_on_test_set() 


Timestamp: 22-10-22--02-00-47
torch.Size([1000, 20]) torch.Size([1000, 20])
cpu
three_layer_recurrent_net(
  (layer1): Embedding(10000, 50)
  (layer2_1): Linear(in_features=50, out_features=50, bias=True)
  (layer2_2): Linear(in_features=50, out_features=50, bias=True)
  (layer3): Linear(in_features=50, out_features=10000, bias=True)
)


epoch= 0 	 time= 1.8801970481872559 	 lr= 1 	 exp(loss)= 1109.6043028292793
test: exp(loss) =  884.3017591994267

epoch= 1 	 time= 4.813304901123047 	 lr= 1 	 exp(loss)= 765.5071614177626
test: exp(loss) =  809.9315406455413

epoch= 2 	 time= 7.753467082977295 	 lr= 1 	 exp(loss)= 618.091834357656
test: exp(loss) =  717.1294936655363

epoch= 3 	 time= 10.7399320602417 	 lr= 1 	 exp(loss)= 506.2707505564073
test: exp(loss) =  654.1054815156931

epoch= 4 	 time= 13.668645143508911 	 lr= 0.9090909090909091 	 exp(loss)= 418.4952120780542
test: exp(loss) =  616.5863267259001


## Question 3 (3 Points)

Implement and train a gated recurrent unit network (GRU) on the small PTB dataset by **explicitly** implementing the GRU layer (the use of the function nn.GRU() is prohibited) :

$$
\begin{align}
r_t =& \textrm{ sigmoid}(A x_t + a + B h_{t-1} + b)\\
z_t =& \textrm{ sigmoid}(C x_t + c + D h_{t-1} + d)\\
n_t =& \textrm{ tanh} (E x_t + e + r_t \odot (F h_{t-1}+f))\\
h_t =& (1-z_t) \odot n_{t} + z_t \odot h_{t-1}\\
\end{align}
$$

where $(A,a), (B,b), (C,c), (D,d), (E,e), (F,f)$ are the parameters (matrix,bias) of all linear transformations and 
$\odot$ is the element-wise product operator or Hadamard product. 

The  dataset is a subset of PTB composed of 20 sub-documents with each training document having 1000 words and each test document having 1000 words as well. 

__Requirements/Grading:__
1. Explicitly implement the GRU network

**Hints:** 
1. Activation function sigmoid is given by *torch.sigmoid*.
1. The Hadamard product $\odot$ is given by `*`.
1. You can reuse the hyperparameters from question 1.


In [5]:
%reset -f
import torch
import datetime
print('Timestamp:',datetime.datetime.now().strftime("%y-%m-%d--%H-%M-%S"))

# Load the (train, test) pair stored in the dataset file and show both shapes.
train_data, test_data = torch.load('dataset/small_PTB.pt')
print(train_data.size(), test_data.size())

# bs is the batch dimension used for the hidden state and for reshaping the scores;
# vocab_size is the size of the embedding table and of the output layer.
bs = 20
vocab_size = 10000


import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import numpy as np
# Now implementing a GRU instead of a VRNN

# YOUR CODE STARTS HERE 
class GRU(nn.Module):
    """GRU language model with the gated recurrence written out explicitly.

    Implements::

        r_t = sigmoid(A x_t + a + B h_{t-1} + b)
        z_t = sigmoid(C x_t + c + D h_{t-1} + d)
        n_t = tanh(E x_t + e + r_t * (F h_{t-1} + f))
        h_t = (1 - z_t) * n_t + z_t * h_{t-1}
        y_t = Y h_t + y

    where ``x_t`` is the embedding of the t-th input word and ``*`` is the
    element-wise product. ``vocab_size`` is read from module scope.
    """

    def __init__(self, hidden_size):
        super(GRU, self).__init__()

        self.layer1 = nn.Embedding(vocab_size, hidden_size)

        # A.x_t + a
        self.layer2_1 = nn.Linear(hidden_size, hidden_size)
        # B.h_t-1 + b
        self.layer2_2 = nn.Linear(hidden_size, hidden_size)
        # C.x_t + c
        self.layer2_3 = nn.Linear(hidden_size, hidden_size)
        # D.h_t-1 + d
        self.layer2_4 = nn.Linear(hidden_size, hidden_size)
        # E.x_t + e
        self.layer2_5 = nn.Linear(hidden_size, hidden_size)
        # F.h_t-1 + f
        self.layer2_6 = nn.Linear(hidden_size, hidden_size)

        # Y.h_t + y
        self.layer3 = nn.Linear(hidden_size, vocab_size)

    def forward(self, word_seq, h_init):
        """Run the GRU recurrence over ``word_seq`` and score every position.

        Args:
            word_seq: tensor of word ids; the first axis is iterated as time.
            h_init: initial hidden state. The training code passes a tensor
                with an extra leading axis of size 1, which broadcasts
                against the per-step embeddings.

        Returns:
            Tuple ``(score_seq, h_final)``: scores with the same leading axes
            as the embedded input and ``vocab_size`` as last axis, and the
            hidden state after the last step (same shape as the per-step
            state, i.e. it keeps any leading axis of ``h_init``).
        """
        g_seq = self.layer1(word_seq)  # e.g. (5, 20, 50) with the default settings

        h_seq = []
        h_t = h_init
        for x_t in g_seq:
            # reset gate
            r = torch.sigmoid(self.layer2_1(x_t) + self.layer2_2(h_t))
            # update gate
            z = torch.sigmoid(self.layer2_3(x_t) + self.layer2_4(h_t))
            # candidate state; the reset gate scales the recurrent contribution
            n = torch.tanh(self.layer2_5(x_t) + r * self.layer2_6(h_t))
            # blend candidate and previous state with the update gate
            h_t = (1 - z) * n + z * h_t
            h_seq.append(h_t)

        h_final = h_t
        # Each h_t may carry the extra size-1 axis of h_init. Reshape to the
        # layout of the embedded input instead of calling squeeze(): a bare
        # squeeze() would also drop the time or batch axis whenever the
        # sequence length or the batch size is 1.
        h_seq = torch.stack(h_seq, dim=0).reshape(g_seq.shape)
        # y_t = Y.h_t + y
        score_seq = self.layer3(h_seq)

        return score_seq, h_final
    
    
# Hyper-parameters: hidden width, initial learning rate, window length.
hidden_size, my_lr, seq_length = 50, 1, 5
# YOUR CODE ENDS HERE 

num_epochs = 5

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Build the model, show its architecture, then move it to the selected device.
net = GRU(hidden_size)
print(net)
net = net.to(device)

# Re-draw the embedding weights, then the output weights, uniformly in [-0.1, 0.1].
nn.init.uniform_(net.layer1.weight, -0.1, 0.1)
nn.init.uniform_(net.layer3.weight, -0.1, 0.1)

print('')

# Loss used by both the training loop and the evaluation function.
criterion = nn.CrossEntropyLoss()
def normalize_gradient(net):
    """Rescale the gradients of ``net`` in place to a global L2 norm of 1.

    The global norm is the square root of the sum of squared norms of all
    parameter gradients. Parameters that have no gradient yet (``grad`` is
    ``None``, e.g. because they did not take part in the last backward pass)
    are skipped instead of raising ``AttributeError``.

    Args:
        net: module whose parameter gradients are normalised.

    Returns:
        float: the global gradient norm measured *before* rescaling.
    """
    grads = [p.grad for p in net.parameters() if p.grad is not None]

    grad_norm_sq = 0
    for g in grads:
        grad_norm_sq += g.data.norm()**2

    grad_norm = math.sqrt(grad_norm_sq)

    if grad_norm < 1e-4:
        # Dividing by a (near-)zero norm would blow the gradients up, so the
        # update is cancelled by clearing the gradients instead.
        net.zero_grad()
        print('grad norm close to zero')
    else:
        for g in grads:
            g.data.div_(grad_norm)

    return grad_norm
def eval_on_test_set():
    """Evaluate the global ``net`` on ``test_data`` and return its perplexity.

    The test rows are visited in consecutive windows of ``seq_length`` rows,
    the hidden state being carried from one window to the next. The
    cross-entropy loss is averaged over the windows; ``exp(mean loss)`` is
    printed and returned.

    Reads the module-level names ``net``, ``criterion``, ``test_data``,
    ``bs``, ``hidden_size``, ``seq_length``, ``vocab_size`` and ``device``.

    Returns:
        float: ``math.exp`` of the mean per-window loss.
    """
    running_loss = 0
    num_batches = 0

    h = torch.zeros(1, bs, hidden_size).to(device)

    # Labels are the inputs shifted by one row, so the last row can only be
    # used as a label (this equals 999 for the 1000-row dataset).
    last_input_row = test_data.size(0) - 1

    # No backward pass happens here: skip building the autograd graph.
    with torch.no_grad():
        for count in range(0, last_input_row - seq_length, seq_length):

            minibatch_data = test_data[count:count + seq_length].to(device)
            minibatch_label = test_data[count + 1:count + seq_length + 1].to(device)

            scores, h = net(minibatch_data, h)

            # flatten (time, batch) into one large batch for the criterion
            minibatch_label = minibatch_label.view(bs * seq_length)
            scores = scores.view(bs * seq_length, vocab_size)

            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    total_loss = running_loss / num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )
    return math.exp(total_loss)

# Wall-clock reference for the cumulative "time=" value printed after each epoch.
start=time.time()

for epoch in range(num_epochs):
    
    # Learning-rate schedule: epochs 0-3 use the initial my_lr; from epoch 4
    # onwards my_lr is divided by 1.1 at the start of the epoch.
    if epoch >= 4:
        my_lr = my_lr / 1.1
    
    # create a new optimizer so that it uses the current learning rate.
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )
        
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
    
    # set the initial h to be the zero vector
    h = torch.zeros(1, bs, hidden_size)

    # move it to the selected device (GPU when available, otherwise CPU)
    h=h.to(device)
    
    # walk through the training rows in consecutive windows of seq_length rows
    for count in range( 0 , 999-seq_length ,  seq_length):
            
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch: the labels are the inputs shifted by one row
        minibatch_data =  train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]        
        
        # move them to the selected device
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        # Detach h so that backpropagation stops at the window boundary
        # instead of reaching back to the beginning of the epoch.
        h=h.detach()
        # h=h.requires_grad_()
                    
        # forward the minibatch through the net; h carries over to the next window
        scores, h  = net( minibatch_data, h )
        
        # reshape the scores and labels to huge batch of size bs*seq_length
        scores          =            scores.view(  bs*seq_length , vocab_size)  
        minibatch_label =   minibatch_label.view(  bs*seq_length )       
        
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(  scores ,  minibatch_label )
        
        # backward pass to compute the gradient of the loss w.r.t. every parameter
        loss.backward()

        # rescale the gradients with normalize_gradient, then do one step of
        # stochastic gradient descent
        normalize_gradient(net)
        optimizer.step()
        
            
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
        
        
    # compute stats for the full training set (mean window loss of this epoch)
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    # eval_on_test_set prints the test perplexity itself; the returned value is kept in test_loss
    test_loss = eval_on_test_set()

Timestamp: 22-10-22--02-01-05
torch.Size([1000, 20]) torch.Size([1000, 20])
cpu
GRU(
  (layer1): Embedding(10000, 50)
  (layer2_1): Linear(in_features=50, out_features=50, bias=True)
  (layer2_2): Linear(in_features=50, out_features=50, bias=True)
  (layer2_3): Linear(in_features=50, out_features=50, bias=True)
  (layer2_4): Linear(in_features=50, out_features=50, bias=True)
  (layer2_5): Linear(in_features=50, out_features=50, bias=True)
  (layer2_6): Linear(in_features=50, out_features=50, bias=True)
  (layer3): Linear(in_features=50, out_features=10000, bias=True)
)


epoch= 0 	 time= 2.1343939304351807 	 lr= 1 	 exp(loss)= 1075.997841774213
test: exp(loss) =  831.3360228582131

epoch= 1 	 time= 5.398738145828247 	 lr= 1 	 exp(loss)= 692.9452692746793
test: exp(loss) =  704.7387849110769

epoch= 2 	 time= 8.668376922607422 	 lr= 1 	 exp(loss)= 544.8568718553637
test: exp(loss) =  619.9499625138033

epoch= 3 	 time= 11.935544967651367 	 lr= 1 	 exp(loss)= 441.75031896638035
test: exp

## Question 4 (4 Points)

Implement GRU-based seq2seq model with Luong attention (https://arxiv.org/pdf/1508.04025.pdf) and train the model on the french to english translation dataset. (the use of the function nn.GRU() is allowed but the attention scheme needs to be implemented **explicitly**) :

The Luong attention algorithm performs the following operations:

1. The encoder generates a set of hidden states, $H = \{\textbf{h}_i\}_{i=1}^{T}$, from the input sentence. The decoder generates a set of hidden states, $S = \{\textbf{s}_t\}_{t=1,2,\ldots}$.
2. The current decoder hidden state is computed as: $\textbf{s}_t = GRU_{decoder}(\textbf{s}_{t-1}, y_{t-1})$. Here, $\textbf{s}_{t-1}$ denotes the previous hidden decoder state, and $y_{t-1}$ the current input, which is also the expected output for the previous timestep.

3. A dot product on the encoder hidden state $\textbf{h}_i$ and the current decoder hidden state $\textbf{s}_t$ to compute the alignment scores: $e_{t,i} = \textbf{s}_t . \textbf{h}_i$. 

4. A softmax function is applied to the alignment scores over all encoder positions, effectively normalizing them into attention weights in a range between 0 and 1: $\alpha_{t, i} = \text{softmax}(\textbf{e}_t)_i = \frac{\exp(e_{t, i})}{\sum_{j=1}^{T}\exp(e_{t, j})}$.

5. These attention weights together with the encoder hidden states are used to generate a context vector through a weighted sum: $\textbf{c}_t = \sum_{i=1}^T\alpha_{t, i}\textbf{h}_i$.

6. An attentional hidden state is computed based on a weighted concatenation of the context vector and the current decoder hidden state: $\tilde{\textbf{s}_t} = \text{tanh}\big(W_c\big[\textbf{c}_t; \textbf{s}_t\big]\big)$.

7. The decoder produces a final output by feeding it a weighted attentional hidden state: $y_t = \text{softmax}(W_y\tilde{\textbf{s}_t})$.

8. Steps 2-7 are repeated until the end of the sequence. 

The attention has to be calculated in parallel via matrix multiplication. A for loop **should not** be used.

__Requirements/Grading:__
1. Implement the forward pass for the network in the train and eval function.
1. Explicitly implement the attention network (Use of nn.GRU() is allowed).

**Hints:** 
1. torch.swapaxes to convert from [seq_len, bs, hidden_size] to [bs, seq_len, hidden_size].
1. torch.bmm to perform batch matrix multiplication
1. torch.concat to concatenate $c_t$ and $s_t$
1. Training took me around ~1 minute per epoch


In [6]:
!python3 -m spacy download fr_core_news_sm
!python3 -m spacy download en_core_web_sm

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
import torch
from torch.utils.data import DataLoader
import pickle
from torch.utils.data import Dataset
import os
import numpy as np

# For this dataset, we are trying to translate french to english
SRC_LANGUAGE = 'fr'
TGT_LANGUAGE = 'en'

# First, we create a custom dataset to load the data. Each item is a pair of french and english datapoint
class CustomDataset(Dataset):
    """Parallel French/English sentence pairs read from two line-aligned files.

    Line ``i`` of the French file is paired with line ``i`` of the English
    file. Only pairs whose *English* line is shorter than ``max_len``
    characters (the trailing newline counts) are kept. The first
    ``train_size`` kept pairs form the training split and the following
    ``test_size`` kept pairs form the test split.

    Args:
        train: ``True`` for the training split, ``False`` for the test split.
        train_size: number of kept pairs assigned to the training split.
        test_size: number of kept pairs assigned to the test split.
        max_len: exclusive upper bound on the English line length.
        data_dir: directory containing ``europarl-v7.fr-en.en`` and
            ``europarl-v7.fr-en.fr``.

    Each item is the tuple ``(french_line, english_line)``; the lines still
    end with their newline character.
    """

    def __init__(self, train, train_size=10000, test_size=1000, max_len=50,
                 data_dir="dataset"):
        self.en_dir = os.path.join(data_dir, "europarl-v7.fr-en.en")
        self.fr_dir = os.path.join(data_dir, "europarl-v7.fr-en.fr")
        # Each file is read exactly once.
        with open(self.en_dir, "r", encoding="utf8") as f:
            english_lines = f.readlines()
        with open(self.fr_dir, "r", encoding="utf8") as f:
            french_lines = f.readlines()

        # Positions of the pairs whose English line has fewer than max_len characters
        self.indicies = np.array(
            [i for i, line in enumerate(english_lines) if len(line) < max_len]
        )

        # First train_size kept pairs for train, the next test_size for test
        if train:
            split = slice(0, train_size)
        else:
            split = slice(train_size, train_size + test_size)
        selected = self.indicies[split]

        self.english_data = [english_lines[i] for i in selected]
        self.french_data = [french_lines[i] for i in selected]

    def __len__(self):
        return len(self.english_data)

    def __getitem__(self, idx):
        return self.french_data[idx], self.english_data[idx]

# Training split of the parallel corpus
dataset = CustomDataset(train=True)

# spaCy-backed tokenizers that turn an input sentence into tokens, keyed by language code
token_transform = {
    SRC_LANGUAGE: get_tokenizer('spacy', language='fr_core_news_sm'),
    TGT_LANGUAGE: get_tokenizer('spacy', language='en_core_web_sm'),
}

# Helper function to call token_transform
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Lazily tokenize one side of every sample in ``data_iter``.

    Args:
        data_iter: iterable of samples indexable as ``sample[0]`` (source
            sentence) and ``sample[1]`` (target sentence).
        language: ``SRC_LANGUAGE`` or ``TGT_LANGUAGE``; selects both the
            element of the sample and the tokenizer in ``token_transform``.

    Yields:
        The token list produced by the language's tokenizer, one per sample.

    Raises:
        KeyError: if ``language`` is neither of the two configured languages.
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    # Both lookups are loop-invariant, so resolve them once up front.
    position = language_index[language]
    tokenize = token_transform[language]

    for data_sample in data_iter:
        yield tokenize(data_sample[position])


# Build, per language, the vocabulary that maps tokens to integer indices.
# Indices reserved for the special symbols
UNK_IDX = 0
PAD_IDX = 1
BOS_IDX = 2
EOS_IDX = 3
# Listed in index order so that each symbol lands on its reserved index
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

vocab_transform = {}
for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    # Fresh pass over the training data for every language
    train_iter = iter(dataset)
    # torchtext Vocab built from the tokenized training sentences
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train_iter, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )
    # Tokens missing from the vocabulary map to <unk>
    vocab_transform[ln].set_default_index(UNK_IDX)

torch.manual_seed(0)
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])


# Functions transform the input sentence to a format that can be used for training 
def sequential_transforms(*transforms):
    """Compose ``transforms`` into one callable applied left to right.

    The returned function feeds its argument to the first transform, the
    result to the second, and so on, and returns the last result. With no
    transforms it returns its argument unchanged.
    """
    def func(txt_input):
        current = txt_input
        for step in transforms:
            current = step(current)
        return current

    return func

def tensor_transform(token_ids: List[int]) -> torch.Tensor:
    """Wrap a list of token indices with ``<bos>``/``<eos>`` as a 1-D tensor.

    Args:
        token_ids: Vocabulary indices of one sentence; may be empty.

    Returns:
        An int64 tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    # dtype is pinned because torch.tensor([]) defaults to float32; an empty
    # sentence could otherwise yield a float tensor that an embedding
    # lookup rejects.
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX])))
# Full preprocessing pipeline per language:
# raw string -> tokens -> vocabulary indices -> tensor framed by BOS/EOS
text_transform = {
    language: sequential_transforms(
        token_transform[language],  # Tokenization
        vocab_transform[language],  # Numericalization
        tensor_transform,           # Add BOS/EOS and create tensor
    )
    for language in [SRC_LANGUAGE, TGT_LANGUAGE]
}

def collate_fn(src, tgt):
    """Turn parallel batches of raw sentences into padded index tensors.

    Each sentence has its trailing newline(s) stripped and is passed through
    the language's ``text_transform`` pipeline. The per-sentence tensors are
    then stacked with ``pad_sequence`` using ``PAD_IDX`` as filler, giving
    tensors laid out as (longest_sequence, batch).

    Returns:
        ``(src_batch, tgt_batch)`` padded tensors.
    """
    encode_src = text_transform[SRC_LANGUAGE]
    encode_tgt = text_transform[TGT_LANGUAGE]

    # Encode pair by pair; zip stops at the shorter of the two inputs
    encoded_pairs = [
        (encode_src(src_sample.rstrip("\n")), encode_tgt(tgt_sample.rstrip("\n")))
        for src_sample, tgt_sample in zip(src, tgt)
    ]

    src_batch = pad_sequence([pair[0] for pair in encoded_pairs], padding_value=PAD_IDX)
    tgt_batch = pad_sequence([pair[1] for pair in encoded_pairs], padding_value=PAD_IDX)
    return src_batch, tgt_batch


Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Walk one shuffled mini-batch through every preprocessing stage and print
# the first sentence pair after each step.
batch_size = 8
dataset = CustomDataset(train=True)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
fr_sentence, eng_sentence = next(iter(train_dataloader))
print(f"Raw Inputs: {fr_sentence[0]}\n{eng_sentence[0]}")

# Stage 1: strip the trailing newline and split each sentence into tokens
fr_token = [token_transform["fr"](sentence.rstrip("\n")) for sentence in fr_sentence]
eng_token = [token_transform["en"](sentence.rstrip("\n")) for sentence in eng_sentence]
print(f"Tokenized Inputs: {fr_token[0]}\n{eng_token[0]}")

# Stage 2: look every token up in the language's vocabulary
fr_idx = list(map(vocab_transform["fr"], fr_token))
eng_idx = list(map(vocab_transform["en"], eng_token))
print(f"Tokenized Inputs to indicies: {fr_idx[0]}\n{eng_idx[0]}")

# Stage 3: frame each index list with the beginning/end-of-sentence markers
fr_pad = list(map(tensor_transform, fr_idx))
eng_pad = list(map(tensor_transform, eng_idx))
print(f"Tokenized Indicies with begin (2) and end token (3): {fr_pad[0]}\n{eng_pad[0]}")

# Stage 4: pad every sentence to the longest one in the batch.
# This also changes the layout from (bs, seq_len) to (seq_len, bs).
fr_pad = pad_sequence(fr_pad, padding_value=PAD_IDX)
eng_pad = pad_sequence(eng_pad, padding_value=PAD_IDX)
print(f"After padding (1): {fr_pad[:, 0]}\n{eng_pad[:, 0]}")

# collate_fn performs stages 1-4 in a single call
x, y = collate_fn(fr_sentence, eng_sentence)
print(f"Same Outputs: {x[:, 0]}\n{y[:, 0]}")

Raw Inputs: La troisième incohérence est liée aux droits de l'homme.

The third area of incoherence is human rights.

Tokenized Inputs: ['La', 'troisième', 'incohérence', 'est', 'liée', 'aux', 'droits', 'de', "l'", 'homme', '.']
['The', 'third', 'area', 'of', 'incoherence', 'is', 'human', 'rights', '.']
Tokenized Inputs to indicies: [34, 1226, 3007, 6, 3065, 156, 311, 7, 18, 251, 5]
[11, 726, 409, 12, 5180, 6, 310, 246, 5]
Tokenized Indicies with begin (2) and end token (3): tensor([   2,   34, 1226, 3007,    6, 3065,  156,  311,    7,   18,  251,    5,
           3])
tensor([   2,   11,  726,  409,   12, 5180,    6,  310,  246,    5,    3])
After padding (1): tensor([   2,   34, 1226, 3007,    6, 3065,  156,  311,    7,   18,  251,    5,
           3])
tensor([   2,   11,  726,  409,   12, 5180,    6,  310,  246,    5,    3,    1,
           1,    1])
Same Outputs: tensor([   2,   34, 1226, 3007,    6, 3065,  156,  311,    7,   18,  251,    5,
           3])
tensor([   2,   11,  726, 

In [68]:
import gc

# Training configuration
num_epochs = 5
hidden_size = 256
my_lr = 1.3
bs = 32

# Vocabulary sizes determine the embedding tables and the output layer width
SRC_VOCAB_SIZE, TGT_VOCAB_SIZE = (
    len(vocab_transform[language]) for language in (SRC_LANGUAGE, TGT_LANGUAGE)
)

# Datasets first, then their loaders; only the training loader shuffles
train_dataset = CustomDataset(train=True)
test_dataset = CustomDataset(train=False)
train_dataloader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=bs, shuffle=False)

def eval_on_test_set():
    """Evaluate ``net`` on ``test_dataloader`` and report exp(mean batch loss).

    Every batch is run with teacher forcing: the decoder is fed the target
    sequence without its last position and is scored against the target
    shifted left by one position. The whole pass runs under
    ``torch.no_grad()`` because evaluation never updates weights, so no
    autograd graph is built and no backward pass is performed.

    Returns:
        ``math.exp`` of the loss averaged over test batches. The same value
        is also printed.

    Raises:
        ZeroDivisionError: If ``test_dataloader`` yields no batches.
    """
    running_loss = 0
    num_batches = 0

    with torch.no_grad():
        for x, y in test_dataloader:
            x, y = collate_fn(x, y)

            # Batch size might be different for the last batch
            batch_size = x.size()[1]
            seq_length = y.size()[0] - 1

            # Zero initial encoder state, sized for this particular batch
            h = torch.zeros(1, batch_size, hidden_size).to(device)

            # Move the index tensors to the selected device
            minibatch_data = x.type(torch.LongTensor).to(device)
            minibatch_label = y.type(torch.LongTensor).to(device)

            # Decoder input drops the last position; targets drop the first
            decoder_y = minibatch_label[:-1]
            scores = net(minibatch_data, decoder_y, h)

            # Flatten (seq, batch) so the criterion sees one row per token
            scores = torch.reshape(scores, (batch_size * seq_length, TGT_VOCAB_SIZE))
            minibatch_label = minibatch_label[1:].view(batch_size * seq_length)
            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    total_loss = running_loss / num_batches
    perplexity = math.exp(total_loss)
    print('test: exp(loss) = ', perplexity)
    return perplexity

# Fill UP ATTENTION NETWORK
class LuongAttention(nn.Module):
    """GRU encoder-decoder with dot-product attention over encoder states.

    The encoder embeds the source indices and runs a GRU over them. The
    decoder embeds the (teacher-forced) target indices and runs a second GRU
    that starts from the encoder's final hidden state. Every decoder state
    attends over all encoder states through a dot-product score. The context
    vector is concatenated with the decoder state, projected through a tanh
    layer, and mapped to log-probabilities over the target vocabulary.
    """

    def __init__(self, hidden_size):
        super().__init__()

        # Encoder: source embedding followed by a GRU
        self.layer1_encoder = nn.Embedding(SRC_VOCAB_SIZE, hidden_size)
        self.gru_encoder = nn.GRU(hidden_size, hidden_size)

        # Decoder: target embedding followed by a GRU
        self.layer1_decoder = nn.Embedding(TGT_VOCAB_SIZE, hidden_size)
        self.gru_decoder = nn.GRU(hidden_size, hidden_size)

        # Combines [context; decoder state], then projects to the vocabulary
        self.layer2 = nn.Linear(hidden_size*2, hidden_size, bias=False)
        self.output = nn.Linear(hidden_size, TGT_VOCAB_SIZE, bias=False)

    def forward(self, x, y, h_init):
        """Score every target position given the source sequence.

        Args:
            x: Source token indices; assumed sequence-first (src_len, batch),
                matching what ``collate_fn`` produces.
            y: Decoder input token indices, laid out like ``x``.
            h_init: Initial encoder hidden state, passed straight to the GRU.

        Returns:
            Log-probabilities over the target vocabulary with the decoder
            time axis first and the batch axis second.
        """
        # Encode the source, then decode starting from the encoder's last state
        encoder_states, encoder_last = self.gru_encoder(self.layer1_encoder(x), h_init)
        decoder_states, _ = self.gru_decoder(self.layer1_decoder(y), encoder_last)

        # bmm wants the batch axis first
        encoder_bf = encoder_states.transpose(0, 1)
        decoder_bf = decoder_states.transpose(0, 1)

        # Dot-product score of every decoder state against every encoder state.
        # NOTE(review): padded source positions are not masked before the softmax.
        alignment = torch.bmm(decoder_bf, encoder_bf.transpose(1, 2))
        attention = F.softmax(alignment, dim=2)
        context = torch.bmm(attention, encoder_bf)

        # Attentional hidden state from [context; decoder state]
        combined = torch.cat((context, decoder_bf), dim=2)
        s_tilda = torch.tanh(self.layer2(combined))
        log_probs = F.log_softmax(self.output(s_tilda), dim=2)

        # Back to time-first layout for the caller
        return log_probs.transpose(0, 1)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Build the network, show its layers, then move its parameters to the device
net = LuongAttention(hidden_size)
print(net)
net = net.to(device)

print('')
# NOTE(review): no ignore_index is set, so padded target positions count toward the loss
criterion = nn.CrossEntropyLoss()
def normalize_gradient(net):
    """Rescale the gradients of ``net`` in place to unit global L2 norm.

    Parameters without a gradient (``p.grad is None``, e.g. frozen or unused
    in the last backward pass) are skipped instead of raising. When the
    global norm is below ``1e-4`` the gradients are cleared with
    ``net.zero_grad()`` rather than divided by a near-zero number.

    Args:
        net: Module whose parameter gradients are normalized.

    Returns:
        The global gradient norm measured before rescaling, as a float.
    """
    grads = [p.grad for p in net.parameters() if p.grad is not None]

    # Accumulate the squared norms of all gradient tensors
    grad_norm_sq = 0
    for grad in grads:
        grad_norm_sq += grad.norm() ** 2

    grad_norm = math.sqrt(grad_norm_sq)

    if grad_norm < 1e-4:
        net.zero_grad()
        print('grad norm close to zero')
    else:
        for grad in grads:
            grad.div_(grad_norm)

    return grad_norm

# Training driver: runs num_epochs passes over train_dataloader with plain SGD
# on unit-normalized gradients, then reports train and test exp(loss) per epoch.
start=time.time()

for epoch in range(num_epochs):
    # keep the initial learning rate for epochs 0-3, then divide it by 1.1 at every later epoch
    if epoch >= 4:
        my_lr = my_lr / 1.1
    
    # create a new optimizer and give it the current learning rate
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )
        
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    

    for x, y in train_dataloader:
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # Transform the raw sentence batches into padded index tensors
        x, y = collate_fn(x, y)
        
        # Batch size might be different for the last batch
        batch_size = x.size()[1]
        # One fewer position than the target: inputs and labels are shifted by one
        seq_length = y.size()[0] - 1
        
        # set the initial h to be the zero vector
        h = torch.zeros(1, batch_size, hidden_size)
        
        # move the tensors to the selected device (GPU when available, otherwise CPU)
        minibatch_data=x.type(torch.LongTensor).to(device)
        minibatch_label=y.type(torch.LongTensor).to(device)
        h=h.to(device)

        # FILL UP FORWARD PASS
        # Teacher forcing: the decoder is fed the target without its last position
        decoder_y = minibatch_label[:-1]
        scores = net(minibatch_data, decoder_y, h)

        # Flatten the time and batch axes; labels are the target shifted left by one
        scores = torch.reshape(scores, (batch_size * seq_length, TGT_VOCAB_SIZE))
        minibatch_label = minibatch_label[1:].view(batch_size * seq_length)
        loss = criterion(scores, minibatch_label)
        # END OF FORWARD PASS
        
        # backward pass to compute the gradient of the loss for every parameter
        loss.backward()

        # rescale the gradient to unit norm, then take one SGD step of size my_lr
        normalize_gradient(net)
        optimizer.step()
        
        # update the running loss  
        running_loss += loss.detach().item()
        num_batches += 1
        
        # Collect garbage to prevent OOM
        gc.collect()
    # compute stats for the full training set (mean of the per-batch losses)
    total_loss = running_loss / num_batches
    elapsed = time.time() - start
    
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set()

cpu
LuongAttention(
  (layer1_encoder): Embedding(7797, 256)
  (gru_encoder): GRU(256, 256)
  (layer1_decoder): Embedding(6278, 256)
  (gru_decoder): GRU(256, 256)
  (layer2): Linear(in_features=512, out_features=256, bias=False)
  (output): Linear(in_features=256, out_features=6278, bias=False)
)


epoch= 0 	 time= 53.79262018203735 	 lr= 1.3 	 exp(loss)= 23.025468735498166
test: exp(loss) =  13.281550109640877

epoch= 1 	 time= 113.34258389472961 	 lr= 1.3 	 exp(loss)= 9.507329117482003
test: exp(loss) =  10.07044475426769

epoch= 2 	 time= 172.71715998649597 	 lr= 1.3 	 exp(loss)= 7.002982552374555
test: exp(loss) =  9.198572612908935

epoch= 3 	 time= 232.7482898235321 	 lr= 1.3 	 exp(loss)= 5.568289317250886
test: exp(loss) =  8.667906008852706

epoch= 4 	 time= 292.42796206474304 	 lr= 1.1818181818181817 	 exp(loss)= 4.473452695793515
test: exp(loss) =  8.501929550620176


In [69]:
# Greedy decoding demo: translate the first test sentences one at a time and
# print each prediction next to its reference translation.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

count = 0
# Inference only: disabling autograd avoids building a graph on every decoding step
with torch.no_grad():
    for x, y in test_dataloader:
        print(x)
        # Zero initial encoder state for a batch of one, on the selected device
        h = torch.zeros(1, 1, hidden_size).to(device)

        x, y = collate_fn(x, y)
        minibatch_data = x.type(torch.LongTensor).to(device)

        # Decoding starts from the beginning-of-sentence token
        start_index = torch.tensor([[BOS_IDX]]).type(torch.LongTensor).to(device)
        predictions = start_index

        # Cap the number of decoding steps at 20
        for _ in range(20):
            # Re-run the model on everything predicted so far
            predictions = net(minibatch_data, predictions, h)
            predictions = torch.reshape(predictions, (-1, TGT_VOCAB_SIZE, 1))

            # Most likely next token at each position (shifted right by one timestep)
            predictions = torch.argmax(predictions, dim=1)

            # Add back the first timestep
            predictions = torch.cat([start_index, predictions], 0)

            # Stop as soon as the model emits end-of-sentence
            if predictions[-1].item() == EOS_IDX:
                break

        # Convert indices to plain ints before looking the words up
        target_vocab = vocab_transform[TGT_LANGUAGE]
        predictions = [target_vocab.lookup_token(i) for i in predictions.reshape(-1).tolist()]
        label_tokens = [target_vocab.lookup_token(i) for i in y.reshape(-1).tolist()]

        print(f"Label: {label_tokens}")
        print(f"Predicted: {predictions}\n")

        # Stops after the 11th example (count runs 1..11)
        count += 1
        if count > 10:
            break

('Mais qui a peur ?\n',)
Label: ['<bos>', 'But', 'who', 'fears', ',', 'exactly', '?', '<eos>']
Predicted: ['<bos>', 'But', 'what', 'has', 'been', 'done', '?', '<eos>']

('Sapristi, mais quelle Europe est-ce donc ? !\n',)
Label: ['<bos>', 'For', '<unk>', "'s", 'sake', '!', 'What', 'sort', 'of', 'Europe', 'is', 'this', '?', '<eos>']
Predicted: ['<bos>', 'What', 'is', 'this', '?', '"', '<eos>']

('Pourquoi cette omission ?\n',)
Label: ['<bos>', 'Why', 'is', 'this', '?', '<eos>']
Predicted: ['<bos>', 'What', 'are', 'we', 'doing', 'so', '?', '<eos>']

('Ce sera ma première question.\n',)
Label: ['<bos>', 'That', 'is', 'my', 'first', 'question', '.', '<eos>']
Predicted: ['<bos>', 'This', 'will', 'be', 'my', 'first', 'question', '.', '<eos>']

("Nous sommes d'accord !\n",)
Label: ['<bos>', 'We', 'agree', '.', '<eos>']
Predicted: ['<bos>', 'We', 'agree', 'with', 'it', '!', '<eos>']

("Je suis encore bouleversé et choqué par ce que j'ai vu.\n",)
Label: ['<bos>', 'I', 'am', 'still', '<unk>', 'fr