# Introduction

This notebook is intended to experiment with Sentence Similarity techniques.

## BERT

In [1]:
# Import Standard Libraries
from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define model
model_name = 'sentence-transformers/bert-base-nli-mean-tokens'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [3]:
# Define text input
text = 'hello world what a time to be alive!'

In [4]:
# Tokenize the sentence
tokens = tokenizer.encode_plus(text, 
                               max_length=128, 
                               truncation=True, 
                               padding='max_length', 
                               return_tensors='pt')

In [13]:
# We have 128 tokens
tokens.attention_mask.shape

torch.Size([1, 128])

In [5]:
# Feed tokens
outputs = model(**tokens)

In [6]:
# Retrieve the embeddings of the sentence by taking the last layer output
embeddings = outputs.last_hidden_state

In [7]:
# We have 128 tokens with 768 values each
embeddings.shape

torch.Size([1, 128, 768])

768 values &rarr; most zeroes &rarr; sparse vectors

Let's convert them to dense vectors through a max pooling.

First of all, we need to multiply the sentence embeddings for their respective attention mask, in order to retrieve only the "real" important tokens and exclude the zero tokens.

In [14]:
# Retrieve attention mask
attention_mask = tokens.attention_mask

In [20]:
# Convert the attention_mask shape to match the embeddings one
print(attention_mask.unsqueeze(-1).shape)
print(attention_mask.unsqueeze(-1).expand(embeddings.shape).shape)

mask = attention_mask.unsqueeze(-1).expand(embeddings.shape).float()

torch.Size([1, 128, 1])
torch.Size([1, 128, 768])


In [21]:
# Compute the embeddings mask, in order to retrieve only important tokens
masked_embeddings = embeddings * mask

In [22]:
masked_embeddings.shape

torch.Size([1, 128, 768])

In [24]:
embeddings

tensor([[[ 3.0681e-01, -7.8806e-02,  1.7431e+00,  ..., -2.5348e-02,
          -1.1080e-01,  4.8310e-02],
         [ 7.1301e-01,  1.0437e-01,  1.8346e+00,  ...,  1.1344e-01,
          -7.5563e-02,  1.2667e-01],
         [ 8.1722e-01,  1.1321e-01,  1.5408e+00,  ..., -3.8067e-01,
           8.7479e-02, -1.9020e-01],
         ...,
         [ 5.4669e-01,  1.7181e-01,  1.1392e+00,  ...,  3.8548e-02,
          -1.5396e-01,  2.3015e-01],
         [ 3.4457e-01,  1.3151e-01,  1.1324e+00,  ..., -1.4203e-03,
          -1.7517e-01,  1.5220e-01],
         [ 3.2320e-01,  3.3353e-03,  1.1888e+00,  ...,  1.6736e-02,
          -2.0863e-01,  8.9315e-02]]], grad_fn=<NativeLayerNormBackward0>)

In [23]:
masked_embeddings

tensor([[[ 0.3068, -0.0788,  1.7431,  ..., -0.0253, -0.1108,  0.0483],
         [ 0.7130,  0.1044,  1.8346,  ...,  0.1134, -0.0756,  0.1267],
         [ 0.8172,  0.1132,  1.5408,  ..., -0.3807,  0.0875, -0.1902],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000]]],
       grad_fn=<MulBackward0>)

If we compare values in `embeddings` and `masked_embeddings`, we can see that now some vectors have zeros instead of values. This is due to the application of the mask, which filters out non-important tokens.

<br>

Now we need to compute the max pooling, in order to transform the sparse `masked_embeddings` vectors into dense vectors.

In [31]:
# Compute the sum of all vecotrs dimension
summed = torch.sum(masked_embeddings, 1)
print(f'Summed shape: {summed.shape}')

# Count the 1 values in the mask (there are either 1 or 0) and a small value of avoid divide-by-zero
counts = torch.clamp(mask.sum(1), min=1e-9)
print(f'Counts shape: {counts.shape}')

# Compute the mean
mean_pooled = summed / counts

Summed shape: torch.Size([1, 768])
Counts shape: torch.Size([1, 768])


In [33]:
# This is our dense sentence vector
mean_pooled.shape

torch.Size([1, 768])