<a href="https://colab.research.google.com/github/Taaniya/sentence-embeddings-with-bert/blob/main/sentence_embeddings_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install transformers sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np

In [3]:
# Mean Pooling - Take attention mask into account for correct averaging

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked-out tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Mask that is non-zero for tokens to include in the
            average; it is broadcast across the embedding dimension.

    Returns:
        The masked sum of the token embeddings over dimension 1 divided by
        the per-sequence mask total.
    """
    hidden_states = model_output[0]
    # Stretch the mask to the embedding shape so it can be multiplied element-wise.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = torch.sum(hidden_states * mask, 1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / token_counts


In [4]:
# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

In [5]:
# Load model from HuggingFace Hub
# Keep the model id in one place so the tokenizer and the model always match.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:
model

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [7]:
# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [8]:
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

By default, the model output contains:
1. The hidden state of the last layer. Shape: `(batch_size, sequence_length, hidden_size)`
2. The pooler layer output. Shape: `(batch_size, hidden_size)`

In [10]:
len(model_output)

2

In [11]:
model_output

BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 9.6638e-02, -1.7067e-01,  7.6037e-03,  ...,  2.3386e-02,
           1.0138e-01, -4.2516e-02],
         [ 6.8078e-02, -1.5394e-01, -1.2517e-01,  ..., -1.1160e-02,
           1.1047e-02, -1.3179e-03],
         [ 1.5145e-02, -3.7817e-01, -1.0184e-01,  ..., -8.7419e-02,
           1.0200e-01,  7.7285e-02],
         ...,
         [ 1.0243e-01, -3.4813e-01, -8.4518e-02,  ..., -1.1049e-01,
           8.5595e-02, -4.6411e-02],
         [-2.5399e-02,  7.7968e-03, -4.4150e-02,  ...,  9.5833e-02,
          -6.1752e-02, -6.1011e-03],
         [ 1.1801e-01, -1.2156e-01,  2.4939e-02,  ..., -9.5817e-03,
           1.4038e-01, -3.2524e-02]],

        [[ 1.2760e-01,  2.1539e-02, -4.1318e-02,  ..., -1.0661e-01,
          -1.9259e-01, -5.4022e-03],
         [ 1.4989e-01, -7.3534e-03, -9.6599e-02,  ..., -8.0247e-02,
          -3.2525e-01, -1.0299e-04],
         [ 2.1020e-01,  7.2042e-02, -3.7893e-02,  ...,  3.0247e-02,
          -3.0968e-01,  1.3309e

In [12]:
model_output[0].shape

torch.Size([2, 7, 768])

In [13]:
model_output[1].shape

torch.Size([2, 768])

In [16]:
# Run the forward pass again, this time asking the model to also return the
# hidden states of all layers (output_hidden_states=True).
# Note: this rebinds `model_output`, replacing the result of the earlier pass.
# torch.no_grad() turns off gradient tracking, which is not needed for inference.
with torch.no_grad():
    model_output = model(**encoded_input, output_hidden_states=True)

Now the output also contains the hidden states of all layers as its 3rd component. This is a tuple with 13 elements: the output of the embeddings layer first, followed by the output of each of the 12 encoder layers. The pooler layer is not included.

In [21]:
len(model_output)

3

In [20]:
len(model_output[2])

13

In [23]:
# Embeddings layer hidden state

model_output[2][0].shape

torch.Size([2, 7, 768])

In [24]:
# Last layer hidden state

model_output[2][12].shape

torch.Size([2, 7, 768])

We can also verify this by comparing the 1st component of the model output (the last layer's hidden state) with the last element of the 3rd component (which holds the hidden states of all layers).

In [26]:
# Compare the tensors with torch directly instead of converting them to
# NumPy implicitly; index -1 selects the last layer without hard-coding the depth.
torch.equal(model_output[0], model_output[2][-1])

True

In [27]:
# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [28]:
sentence_embeddings.shape

torch.Size([2, 768])

In [29]:
# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)


In [None]:
print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[ 0.0225, -0.0783, -0.0230,  ..., -0.0083,  0.0265, -0.0020],
        [ 0.0417,  0.0011, -0.0155,  ..., -0.0218, -0.0636, -0.0088]])


In [None]:
sentence_embeddings.shape

torch.Size([2, 768])

#### Using sentence transformers library

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [None]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

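By default, the SBERT architecture applies a mean pooling strategy to the token embeddings, as the `'pooling_mode_mean_tokens': True` setting in the Pooling module above shows.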

In [None]:
embeddings = model.encode(sentences)

In [None]:
print(embeddings)

[[ 0.02250258 -0.07829181 -0.02303076 ... -0.00827927  0.0265269
  -0.00201898]
 [ 0.04170236  0.0010974  -0.01553419 ... -0.02181627 -0.06359357
  -0.00875284]]


In [None]:
embeddings.shape

(2, 768)

Compare the embeddings produced by the two approaches.

In [None]:
# The two pipelines may run on different devices or batch differently, so
# compare the floating-point embeddings within a small tolerance rather than
# bit-for-bit. The tensor is converted to NumPy explicitly first.
np.allclose(sentence_embeddings.detach().cpu().numpy(), embeddings, atol=1e-6)

True

#### References
* https://huggingface.co/sentence-transformers/all-mpnet-base-v2
* [SBERT paper](https://arxiv.org/pdf/1908.10084.pdf)
* https://www.sbert.net/index.html