# Introduction

This notebook is intended to experiment with Sentence Similarity techniques.

## BERT

In [94]:
# Import Standard Libraries
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

## Preprocessing

In [75]:
# Name of the pretrained checkpoint shared by the tokenizer and the model
model_name = 'sentence-transformers/bert-base-nli-mean-tokens'

# Load the tokenizer and the model for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [76]:
# Define the single sentence used to walk through the embedding steps
text = 'hello world what a time to be alive!'

In [77]:
# Tokenize the sentence, padding/truncating it to 128 tokens and returning PyTorch tensors.
# The tokenizer is called directly: `encode_plus` is deprecated in favour of `__call__`,
# which takes the same arguments and returns the same encoding.
tokens = tokenizer(text,
                   max_length=128,
                   truncation=True,
                   padding='max_length',
                   return_tensors='pt')

In [78]:
# The input was padded to max_length=128, so the attention mask covers 128 token positions
tokens.attention_mask.shape

torch.Size([1, 128])

In [80]:
# Feed the tokens to the model, unpacking the tokenizer output as keyword arguments.
# The call is not wrapped in torch.no_grad(), so the resulting tensors carry a grad_fn.
outputs = model(**tokens)

In [42]:
# Retrieve the per-token embeddings of the sentence by taking the output of the last hidden layer
embeddings = outputs.last_hidden_state

In [43]:
# One vector of 768 values for each of the 128 token positions
embeddings.shape

torch.Size([1, 128, 768])

768 values &rarr; most zeroes &rarr; sparse vectors

Let's convert them to dense vectors through a mean pooling.

First of all, we need to multiply the token embeddings by their respective attention mask, in order to keep only the "real" tokens and zero out the padding tokens.

In [44]:
# Retrieve the attention mask returned by the tokenizer alongside the input ids
attention_mask = tokens.attention_mask

In [45]:
# Reshape the attention mask so it lines up with the embeddings:
# first add a trailing axis, then repeat it along the hidden dimension
column_mask = attention_mask.unsqueeze(-1)
print(column_mask.shape)

expanded_mask = column_mask.expand(embeddings.shape)
print(expanded_mask.shape)

# Cast to float so the mask can be multiplied with the embeddings
mask = expanded_mask.float()

torch.Size([1, 128, 1])
torch.Size([1, 128, 768])


In [46]:
# Apply the mask: embeddings at positions where the mask is 0 become zero vectors,
# so only the real tokens keep their values
masked_embeddings = embeddings * mask

In [47]:
# The multiplication is element-wise, so the shape matches the embeddings
masked_embeddings.shape

torch.Size([1, 128, 768])

In [48]:
# Token embeddings before masking
embeddings

tensor([[[ 3.0681e-01, -7.8806e-02,  1.7431e+00,  ..., -2.5348e-02,
          -1.1080e-01,  4.8310e-02],
         [ 7.1301e-01,  1.0437e-01,  1.8346e+00,  ...,  1.1344e-01,
          -7.5563e-02,  1.2667e-01],
         [ 8.1722e-01,  1.1321e-01,  1.5408e+00,  ..., -3.8067e-01,
           8.7479e-02, -1.9020e-01],
         ...,
         [ 5.4669e-01,  1.7181e-01,  1.1392e+00,  ...,  3.8548e-02,
          -1.5396e-01,  2.3015e-01],
         [ 3.4457e-01,  1.3151e-01,  1.1324e+00,  ..., -1.4203e-03,
          -1.7517e-01,  1.5220e-01],
         [ 3.2320e-01,  3.3353e-03,  1.1888e+00,  ...,  1.6736e-02,
          -2.0863e-01,  8.9315e-02]]], grad_fn=<NativeLayerNormBackward0>)

In [49]:
# Token embeddings after masking
masked_embeddings

tensor([[[ 0.3068, -0.0788,  1.7431,  ..., -0.0253, -0.1108,  0.0483],
         [ 0.7130,  0.1044,  1.8346,  ...,  0.1134, -0.0756,  0.1267],
         [ 0.8172,  0.1132,  1.5408,  ..., -0.3807,  0.0875, -0.1902],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000]]],
       grad_fn=<MulBackward0>)

If we compare values in `embeddings` and `masked_embeddings`, we can see that now some vectors have zeros instead of values. This is due to the application of the mask, which filters out non-important tokens.

<br>

Now we need to compute the mean pooling, in order to transform the sparse `masked_embeddings` vectors into a single dense sentence vector.

In [50]:
# Sum the masked token vectors along the token axis (dim 1)
summed = masked_embeddings.sum(dim=1)
print(f'Summed shape: {summed.shape}')

# Count the 1 values in the mask (entries are either 1 or 0);
# clamp to a tiny positive value to avoid a divide-by-zero
counts = mask.sum(dim=1).clamp(min=1e-9)
print(f'Counts shape: {counts.shape}')

# Mean pooling: summed vectors divided by the number of real tokens
mean_pooled = summed / counts

Summed shape: torch.Size([1, 768])
Counts shape: torch.Size([1, 768])


In [51]:
# This is our dense sentence vector: a single vector of 768 values for the whole sentence
mean_pooled.shape

torch.Size([1, 768])

## Similarity

In [81]:
# Sentences to compare. The first one is used as the reference
# when the cosine similarity is computed later on.
# NOTE(review): "riends" and "sheel" look like typos; fixing them changes the
# tokenization and therefore the similarity scores recorded in this notebook.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his riends.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut sheel."
]

In [86]:
# Initialize the dictionary for the tokenized sentences (input ids and attention masks)
tokens = {
    'input_ids': [],
    'attention_mask': []
}

In [88]:
# Tokenize all the sentences in a single batched call. Every sentence is padded or
# truncated to 128 tokens, so the tokenizer returns already-stacked tensors with
# one row per sentence.
encoded = tokenizer(sentences,
                    max_length=128,
                    truncation=True,
                    padding='max_length',
                    return_tensors='pt')

# Rebuild `tokens` from scratch with only the inputs fed to the model.
# Rebinding (instead of appending to lists and stacking them in place) keeps this
# cell re-runnable: stacked tensors have no `.append`, so a second run used to fail.
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask']
}

In [89]:
# One row of 128 token ids per sentence
tokens['input_ids'].shape

torch.Size([6, 128])

In [90]:
# Compute the per-token embeddings for all the sentences.
# This is inference only, so autograd is disabled: no computation graph is built
# for the batch and the values are unchanged.
with torch.no_grad():
    outputs = model(**tokens)

# Last hidden layer output: one vector per token position, per sentence
embeddings = outputs.last_hidden_state

In [91]:
# Expand the attention mask along the hidden dimension so it matches the embeddings,
# then cast it to float for the element-wise multiplication
mask = (
    tokens['attention_mask']
    .unsqueeze(-1)
    .expand(embeddings.size())
    .float()
)

In [92]:
# Zero out the embeddings at the positions where the attention mask is 0
masked_embeddings = embeddings * mask

In [93]:
# Mean pooling: sum the masked token vectors of each sentence along the token axis
summed = masked_embeddings.sum(dim=1)

# Number of real tokens per sentence, clamped to avoid a divide-by-zero
counts = mask.sum(dim=1).clamp(min=1e-9)

# One dense vector per sentence
mean_pooled = summed / counts

In [95]:
# Detach the pooled tensor from PyTorch and convert it to a NumPy array for scikit-learn.
# The isinstance guard keeps this cell re-runnable: after the first run `mean_pooled`
# is already an ndarray, which has no `.detach()`.
if isinstance(mean_pooled, torch.Tensor):
    mean_pooled = mean_pooled.detach().numpy()

# Cosine similarity between the first sentence (the reference) and all the others
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)


array([[0.32754934, 0.72192574, 0.1747549 , 0.44709635, 0.54106987]],
      dtype=float32)

Ref: "Three years later, the coffin was still full of Jello.",
- 0: "The fish dreamed of escaping the fishbowl and into the toilet where he saw his riends.",
- 1: "The person box was packed with jelly many dozens of months later.",
- 2: "Standing on one's head at job interviews forms a lasting impression.",
- 3: "It took him a month to finish the meal.",
- 4: "He found a leprechaun in his walnut sheel."

<br>

The reference sentence has the highest similarity with sentence number 1, as we expected! They use almost the same words and have the same meaning.