In [2]:
# Run this cell to intall/import all denpendicies needed for this homework; you can also add arbitrary libararies.

!pip install datasets
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
from nltk import word_tokenize
import nltk
nltk.download('punkt')
from scipy.stats import pearsonr, kendalltau
from pprint import pprint 
import matplotlib.pyplot as plt
import gdown



  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paolobonicco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# This function is used to specify the random seeds that could impact pytorch.

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    torch.set_default_tensor_type(torch.DoubleTensor) #added to avoid datatype problems

# **Task 1: Getting to Know Pytorch: Semantic Textual Similarity**

In this task, we define semantic textual similarity (STS) as a **supervised** regression task in which the semantic similarity of two pieces of text (typically sentences) should be determined. 

### **Task 1.1: Data Preparation**

**Run the following cell to load the dataset for this task first**. Each entry of this dataset contains one English sentence pair and their similarity score. 

To get familiar with the data format, please **print** the first entry of `train_set`, the size of `dev_set`, and the first 3 `sentence1` in `train_set`.

**Hint**: the data is structured like both Python dictionary and Pandas DataFrame.



In [4]:
train_set = load_dataset("stsb_multi_mt", "en", split='train')
dev_set = load_dataset("stsb_multi_mt", "en", split='dev')

dev_set

Found cached dataset stsb_multi_mt (/Users/paolobonicco/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
Found cached dataset stsb_multi_mt (/Users/paolobonicco/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)


Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1500
})

In [5]:
print(dev_set)
print(train_set)

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1500
})
Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 5749
})


In [6]:
# TODO: YOUR CODE HERE

print("First entry of train_set: ", train_set[0])
# print("Size of train_set", len(train_set))
print("Size of dev_set: ", len(dev_set))
print("First 3 sentences1 in train_set: ", train_set[:3]['sentence1'])


First entry of train_set:  {'sentence1': 'A plane is taking off.', 'sentence2': 'An air plane is taking off.', 'similarity_score': 5.0}
Size of dev_set:  1500
First 3 sentences1 in train_set:  ['A plane is taking off.', 'A man is playing a large flute.', 'A man is spreading shreded cheese on a pizza.']


## **Task 1.1.1: Embed the sentences**
We will use the averages of the words' [FastText embeddings](https://fasttext.cc/docs/en/english-vectors.html) to embed both sentences. 

**Run the following cell to download the embeddings.**

In [7]:
# download word emebddings to your drive and unzip the file (run this cell only when you haven't downloaded the emb file yet.)
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
# !unzip wiki-news-300d-1M.vec.zip

### a) 
Implement a funtion to read the word embeddings into a Python dictionary that maps every token to the corresponding vector. Represent the vectors as Numpy arrays. Only load the embeddings of the first 40,000 tokens in the file. 

**Print** the size of the dictionary and the first 20 dimensions of the embedding for word "homework".

In [8]:
import numpy as np
import io

def load_embeddings(file="DL4NLP/HW-2/data/wiki-news-300d-1M.vec", limit = 40000):
  # TODO: YOUR CODE HERE
  fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
  data = {}
  for line in fin:
      tokens = line.rstrip().split(' ')
      data[tokens[0]] = np.array(tokens[1:])
      if(len(data) > limit-1):
        break
  return data

In [9]:
fname = 'data/wiki-news-300d-1M.vec'
word_limit = 40000
WORD_VEC = load_embeddings(fname, word_limit)
print("Size of dictionary / Number of tokens: ", len(WORD_VEC))
print("First 20 dimensions of the embedding for the word \"homework\" \n", WORD_VEC["homework"][:20])

Size of dictionary / Number of tokens:  40000
First 20 dimensions of the embedding for the word "homework" 
 ['-0.1756' '0.0695' '0.0847' '0.0988' '-0.1196' '-0.1389' '-0.0379'
 '0.0543' '0.1022' '-0.0082' '-0.0488' '-0.1748' '-0.0181' '-0.1310'
 '-0.1794' '0.2143' '-0.1612' '-0.1130' '0.0213' '-0.0763']


### b)
Implement a function that tokenizes a sentence using [nltk.word_tokenize](https://www.nltk.org/book/ch03.html#accessing-text-from-the-web-and-from-disk) and returns a list of tokens for given sentence.

**Print** the tokenized sentence1 and sentence2 of the first entry in the training set. 

In [10]:
def tokenize(sentence):
  # TODO: YOUR CODE HERE
  token = word_tokenize(sentence)
  return token
  

In [11]:
sentence1 = train_set[:1]["sentence1"][0]
sentence2 = train_set[:1]["sentence2"][0]
print("Tokens of sentence1: ", tokenize(sentence1))
print("Tokens of sentence2: ", tokenize(sentence2))

Tokens of sentence1:  ['A', 'plane', 'is', 'taking', 'off', '.']
Tokens of sentence2:  ['An', 'air', 'plane', 'is', 'taking', 'off', '.']


### c)
Implement a function that maps a sentence to its embedding. The sentence-level embedding should be the average of the embeddings of its tokens. If a token does not exist in the vocabulary of FastText, embed this token as a 0-vector with the same dimensions as the FastText embeddings.

**Print** the shape and the first 20 dimensions of sentence1's embedding of the first entry in the training set.


In [12]:
def embed_sentence(sentence, word2emb):
  # TODO: YOUR CODE HERE
  token_sentece = tokenize(sentence) # tokenize the sentence
  sent_embed = []
  for token in token_sentece:
    if token in word2emb:
      word_emb = word2emb[token] # retrive vec for the token's sentences
    else:
      word_emb = np.zeros(300)
    word_emb = [float(element) for element in word_emb] # Convert to float for evaluate the mean
    sent_embed.append(np.array(word_emb))

  sent_embed = np.mean(sent_embed, axis=0) # Evaluate the mean 
  
  return sent_embed

In [13]:
sentence1 = train_set[:1]["sentence1"][0]
res = embed_sentence(sentence1, WORD_VEC)
print(res[:20])

[ 0.01895     0.01555    -0.03663333  0.04301667 -0.05261667 -0.043
 -0.05235    -0.02503333 -0.01348333 -0.02275     0.023      -0.00196667
  0.01398333 -0.01931667  0.04275     0.02686667 -0.00416667  0.05871667
 -0.06091667  0.0174    ]


## **Task 1.1.2: Build Custom Dataset**


### a)
Implement a custom dataset class inheriting [torch.utils.data.Dataset](https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler) and override the following methods:
- `__len__`: which returns the size of the dataset.
- `__getitem__`: to support the indexing such that dataset[i] can be used to get `ith` sample. 

The `ith` sample should be a Python dict with two entries: 
- `encoding` the encoding of one sentence pair, which is the concatenation of the embeddings of the two sentences of a pair. E.g., sent1 = [1,2], sent2 = [3,4], the encoding for sent1 and sent2 should be [1,2,3,4].
- `score` the similarity score between the two sentences.

**Hint**: examples can be found here: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [14]:
class MLPDataset(Dataset):
  def __init__(self, sents_1, sents_2, scores):
    """
    Arguments:
      sents_1 (List[string]): the list of the first sentences.
      sents_2 (List[string]): the list of the second sentences.
      scores (List[float]): the list of the similarity scores.
    """
    # TODO: YOUR CODE HERE
    self.sents_1 = sents_1
    self.sents_2 = sents_2
    self.scores = scores
    
  
  def __getitem__(self, idx):
    # TODO: YOUR CODE HERE
    sent1 = embed_sentence(self.sents_1[idx], WORD_VEC)
    sent2 = embed_sentence(self.sents_2[idx], WORD_VEC)
    score = self.scores[idx]
    encoding = np.append(sent1, sent2)
    item = {'encoding': encoding, 'score': score}
    return item
    

  def __len__(self):
    # TODO: YOUR CODE HERE
    return len(self.sents_1)

### b)
Instantiate the above class for our `train_set` and `dev_set`.

**Print** the size of `dev_dataset` and the shape of the encoding of the first example.

In [15]:
train_set[:]['sentence1'][0]

'A plane is taking off.'

In [16]:
# TODO: YOUR CODE HERE

train_dataset = MLPDataset(train_set['sentence1'][:], train_set['sentence2'][:], train_set['similarity_score'][:])
dev_dataset = MLPDataset(dev_set['sentence1'][:], dev_set['sentence2'][:], dev_set['similarity_score'])

print("Size of dev_dataset: ", len(dev_dataset))

Size of dev_dataset:  1500


In [17]:
len(train_dataset[0]['encoding'])

600

## **Task 1.2: Scoring the Similarity**
We will train a simple multi-layer perceptron (MLP) to score the similarity of the two sentences. 

### **Task 1.2.1: Build MLP using Pytorch**

We will use [`pytorch.nn`](https://pytorch.org/docs/stable/nn.html) to build our MLP. 

Implement a class inheriting [`pytorch.nn.Module`]() for our MLP, which has the following components:
- A [linear layer](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#torch.nn.Linear) with 1,200 dimensions and [relu activation](https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html#torch.nn.ReLU) , which takes the encoding of one sentence pair as the input.
- A [dropout layer](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html#torch.nn.Dropout) with probability 0.1.
- A linear layer with 600 dimensions and relu activation.
- A dropout layer with probability 0.1.
- A linear layer with 300 dimensions and relu activation.
- A dropout layer with probability 0.1.
- A linear layer with 1 dimension (output layer). 

**Hint**: 
- You need to override the method `forward` in this class
- Use `nn.Sequential` to sequentialize the layers.
- You may want to see a quick example: https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html?highlight=sequential


In [2]:
class MLP(nn.Module):
  def __init__(self):
    super(MLP, self).__init__()
    # TODO: YOUR CODE HERE
    self.layers = nn.Sequential(
        nn.Linear(600, 1200),
        nn.ReLU(),
        nn.Dropout(0.1),
        nn.Linear(1200, 600),
        nn.ReLU(),
        nn.Dropout(0.1),
        nn.Linear(600, 300),
        nn.ReLU(),
        nn.Dropout(0.1),
        nn.Linear(300, 1)
    )

  def forward(self, x):
    # TODO: YOUR CODE HERE
    return self.layers(x)

model = MLP()
print(model)

NameError: name 'nn' is not defined

### **Task 1.2.2: Train MLP with Pytorch**
Train the MLP with the following setups/hyperparameters:
- [AdmW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html#torch.optim.AdamW) with a learning rate of 2e-3 as the  optimizer
- [Mean Square Error](https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html#torch.nn.MSELoss) as the loss function
- batch size: 128
- number of training epochs: 17


### a)
The method for training is provided below, which returns the list of the train loss at all epochs and the trained model. Please define the corresponding parameters to call this method and **plot** the training loss using `matplotlib.pyplot.plot`; with GPU the training takes about 1.5 mins for 17 epochs. 

**Hint**: create the dataloader for the custom datasets (`train_dataset` and `dev_dataset`) using [`torch.utils.data.DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader); reshuffle the training data at every epoch (don't forget to define the batch size for the dataloader!).



In [19]:
def train(model, train_dataloader, eval_dataloader, optimizer, loss_func, num_epochs, device='cuda'):
  
  train_losses = []

  for epoch in range(num_epochs):

    if epoch == 0:
      model.eval()
      loss_per_epoch = 0
      for batch_data in train_dataloader:
        with torch.no_grad():
          predictions = model(batch_data['encoding'].to(device))
          targets = batch_data['score'].to(device) # only if device='cuda'
          train_loss = loss_func(predictions.squeeze(), targets)
          loss_per_epoch += train_loss.item()
      loss_per_epoch = loss_per_epoch/len(train_dataloader)
      train_losses.append(loss_per_epoch)
      print(f'\ninital train loss: {loss_per_epoch}')

    model.train()
    loss_per_epoch = 0
    for batch_data in train_dataloader:
      predictions = model(batch_data['encoding'].to(device))
      targets = batch_data['score'].to(device) # only if device='cuda'
      train_loss = loss_func(predictions.squeeze(), targets) 
      loss_per_epoch += train_loss.item()

      optimizer.zero_grad()
      train_loss.backward()
      optimizer.step()   

    loss_per_epoch = loss_per_epoch/len(train_dataloader)
    train_losses.append(loss_per_epoch)
    print(f'\n Epoch {epoch+1} train loss: {loss_per_epoch}')
    #evaluate(model, eval_dataloader, loss_func)
  
  return train_losses, model

In [21]:
# Set random seeds; do not change this!
seed_everything(seed=999)

# TODO: YOUR CODE HERE
# ==============================================

# Define the training hyperparameters
num_epochs = 17
batch_size = 128
learning_rate = 0.002

# Create dataloader
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size)

# Initialize the model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MLP()
model.to(device)

# Define Optimizer and Loss Function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_func = nn.MSELoss()

train_losses, model = train(model, train_dataloader, dev_dataloader, optimizer, loss_func, num_epochs, device)

: 

: 

In [None]:
# YOUR CODE FOR PLOTTING HERE
def plot_epochs_loss(lst_epochs, lst_train_loss):
  plt.plot(lst_epochs, lst_train_loss, marker="o")
  plt.xticks(lst_epochs)
  plt.xlabel("Epochs")
  plt.ylabel("Training Loss")
  plt.title("Training Loss for each Epoch")
  plt.show()

epochs = list(range(0, 18))

plot_epochs_loss(epochs, train_losses)

# ==============================================

### b)
Implement a method to evaluate a model which calculates the Pearson correlation between the predictions of the model and the similarity scores from the dataset.

**Print** the evaluation results  of your trained model on the dev set.

In [None]:
from scipy.stats import pearsonr
def evaluate(model, eval_dataloader):
  # TODO: YOUR CODE HERE:
  model.eval() # this is used to set the model in evlauation mode; details see https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.eval
  
  with torch.no_grad(): # this is used to disable gradient calculation; details see https://pytorch.org/docs/stable/generated/torch.no_grad.html 
    for batch_data in eval_dataloader:
      predictions = model(batch_data['encoding'].to(device))
      targets = batch_data['score'].to(device)
    # transform preds and targets to numpy in CPU to then use pearsonr
    preds_numpy = predictions.view(-1).cpu().numpy() #reshape and put on cpu
    targets_numpy = targets.cpu().numpy() #put on cpu
    
  return preds_numpy, targets_numpy, pearsonr(preds_numpy, targets_numpy)[0], pearsonr(preds_numpy, targets_numpy)[1]

preds, targets, pearson_value, p_val = evaluate(model, dev_dataloader)
print(f"Pearson Correlation value == {pearson_value:.6f}")
print(f"P-Value == {p_val}")