In [1]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
random_seed = 69
random.seed(random_seed)

torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [21]:
text = 'rakesh raj is doing nlp tonight'

encoding = tokenizer.batch_encode_plus(
    text,
    padding = True,
    truncation = True,
    return_tensors = 'pt',
    add_special_tokens = True,
)

input_ids = encoding['input_ids']
# print(input_ids)
attention_mask = encoding['attention_mask']
print(attention_mask)

tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 0],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 0],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 0],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 0],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 0],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])


In [26]:
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    word_embeddings = outputs.last_hidden_state


print(word_embeddings)

tensor([[-0.4422, -0.0526, -0.0091,  ...,  0.0963,  0.1625,  0.3031],
        [-0.4922, -0.6787,  0.0211,  ...,  0.2248,  0.6457, -0.2831],
        [ 1.0238,  0.0883, -0.3741,  ...,  0.3148, -0.8268, -0.2171]])


In [15]:
type(encoding)

transformers.tokenization_utils_base.BatchEncoding

In [28]:
#Sentence Transformers

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [31]:
sentences = [
    "This framework generates embeddings for each input sentence",
]
sentence_embeddings = model.encode(sentences)

In [32]:
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173543e-02 -4.28515673e-02 -1.56286340e-02  1.40537675e-02
  3.95537801e-02  1.21796280e-01  2.94333380e-02 -3.17523777e-02
  3.54959704e-02 -7.93140084e-02  1.75878275e-02 -4.04369757e-02
  4.97259796e-02  2.54912637e-02 -7.18699545e-02  8.14968348e-02
  1.47070771e-03  4.79627289e-02 -4.50335816e-02 -9.92174894e-02
 -2.81769391e-02  6.45046085e-02  4.44670655e-02 -4.76217270e-02
 -3.52952443e-02  4.38671634e-02 -5.28565980e-02  4.33038571e-04
  1.01921484e-01  1.64072551e-02  3.26996669e-02 -3.45986895e-02
  1.21339504e-02  7.94871226e-02  4.58341185e-03  1.57778524e-02
 -9.68208257e-03  2.87626274e-02 -5.05806543e-02 -1.55793773e-02
 -2.87907124e-02 -9.62279830e-03  3.15556526e-02  2.27349252e-02
  8.71449485e-02 -3.85027342e-02 -8.84718746e-02 -8.75496026e-03
 -2.12343317e-02  2.08924077e-02 -9.02078077e-02 -5.25732674e-02
 -1.05638439e-02  2.88311224e-02 -1.61454864e-02  6.17841911e-03
 -1.23234

Working Implementation Below This Cell

In [48]:
import nltk
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

nltk.download('punkt')
sent_embedding_model = "all-MiniLM-L6-v2"
device_used = "cuda"

[nltk_data] Downloading package punkt to /home/mohitavva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
def split_sentence(paragraph):
    sentences = nltk.sent_tokenize(paragraph)
    return sentences

def sent_embedding(paragraph):
    sentences = split_sentence(paragraph)
    if torch.cuda.is_available():
        model = SentenceTransformer(sent_embedding_model, device=device_used)
    else:
        model = SentenceTransformer(sent_embedding_model)
    sentence_embeddings = model.encode(sentences)
    return sentence_embeddings

In [66]:
embeddings = sent_embedding("Hello how are you? The car looks good. Hi there, are you good?")

In [67]:
similarity = 1 - cosine(embeddings[0], embeddings[2])

print(similarity)

0.6687135297826821
