In [1]:
# Install Sentence Transformers.
# Uncomment the line below if the package is not already available in this environment.
# %pip install -U sentence-transformers

SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.

Quickstart guide: https://www.sbert.net/docs/quickstart.html

In [2]:
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Identifier of the pretrained checkpoint handed to SentenceTransformer.
# Presumably resolved via the Hugging Face Hub (this cell's output mentions
# Hub authentication) -- TODO confirm.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Sentence Embeddings and Similarity

In [4]:
# The sentences to encode: the first two both describe pleasant weather,
# while the third is about an unrelated topic, which gives the similarity
# scores computed later something to contrast.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

In [5]:
# Calculate embeddings by calling model.encode().
# The result is iterated in parallel with `sentences` in the next cell,
# i.e. it is consumed as one embedding per input sentence.
embeddings = model.encode(sentences)

In [6]:
# Walk the sentences and their embeddings in lockstep and report the shape of
# each embedding. Print `vector` itself instead to inspect the raw values.
for text, vector in zip(sentences, embeddings):
    print("Sentence:", text)
    print(" Shape of Embeddings:", vector.shape)

Sentence: The weather is lovely today.
 Shape of Embeddings: (384,)
Sentence: It's so sunny outside!
 Shape of Embeddings: (384,)
Sentence: He drove to the stadium.
 Shape of Embeddings: (384,)


In [7]:
# Calculate the embedding similarities.
# The embeddings are compared against themselves, so the result is a square
# matrix in which entry [i][j] scores sentence i against sentence j and the
# diagonal scores each sentence against itself.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])


In [8]:
# Calculate cosine similarities explicitly with util.cos_sim.
# NOTE(review): the values shown here match the model.similarity() output in
# the previous cell, which suggests that this model's similarity function is
# cosine similarity -- confirm against the library documentation.
cosine_scores = util.cos_sim(embeddings, embeddings)
cosine_scores

tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])

In [9]:
# Paraphrase Mining: score pairs of entries from `sentences` using `model`.
# Each result is unpacked as (score, i, j) in the next cell, where i and j
# are used as indices into `sentences`.
paraphrases = util.paraphrase_mining(model, sentences)

In [10]:
# Print the first five entries of `paraphrases` as "<sentence i> <sentence j>  -->  <score>".
# The slice simply returns fewer items when fewer than five pairs exist
# (three sentences yield only three pairs).
for score, i, j in paraphrases[:5]:
    print(sentences[i], sentences[j], " --> ", score)

The weather is lovely today. It's so sunny outside!  -->  0.6659553050994873
It's so sunny outside! He drove to the stadium.  -->  0.14114463329315186
The weather is lovely today. He drove to the stadium.  -->  0.10458402335643768
