In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from transformers import BertModel, AutoTokenizer
from scipy.spatial.distance import cosine

In [3]:
model_name = "bert-base-cased"

model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
sentence = "When life gives you lemons, don't make lemonade."
encoded_input = tokenizer(sentence, return_tensors='pt')
encoded_input

{'input_ids': tensor([[  101,  1332,  1297,  3114,  1128, 22782,  1116,   117,  1274,   112,
           189,  1294, 22782,  6397,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
output = model(**encoded_input)
# same as above
# output = model(input_ids=encoded_input['input_ids'], token_type_ids=encoded_input['token_type_ids'], attention_mask=encoded_input['attention_mask'])

In [6]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.5752,  0.0460, -0.0275,  ..., -0.1920,  0.4517,  0.0025],
         [ 0.4231, -0.4002,  0.8094,  ..., -0.1844,  0.2036, -0.1496],
         [ 0.2830, -0.0136,  0.1657,  ...,  0.0995,  0.0139,  0.2934],
         ...,
         [ 0.0118,  0.3637, -0.0355,  ..., -0.0079, -0.5741,  0.2081],
         [ 0.6854,  0.0321,  0.3932,  ...,  0.0104,  0.4541,  0.0772],
         [ 0.7814,  0.1665,  0.1044,  ..., -0.1365,  1.1144, -0.4755]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-6.2927e-01,  5.1027e-01,  9.9990e-01, -9.9443e-01,  9.7256e-01,
          9.2796e-01,  9.9039e-01, -9.9347e-01, -9.7411e-01, -6.4015e-01,
          9.7887e-01,  9.9861e-01, -9.9839e-01, -9.9983e-01,  8.3185e-01,
         -9.7874e-01,  9.8912e-01, -6.0293e-01, -9.9997e-01, -8.8200e-01,
         -3.0767e-01, -9.9991e-01,  3.5105e-01,  9.7034e-01,  9.7623e-01,
          5.2552e-02,  9.8623e-01,  9.9997e-01,  9.0236e-01, -4.943

In [7]:
last_hidden_state = output.last_hidden_state
last_hidden_state.size()
#vecto representation fot each tokens in the sentence

torch.Size([1, 16, 768])

In [8]:
pooler_output = output.pooler_output
pooler_output.shape
#vecto representation fot the entire sentence

torch.Size([1, 768])

In [9]:
# Defining a function to encode the input text and get model predictions
def predict(text):
    encoded_inputs = tokenizer(text, return_tensors="pt")
    # return model(**encoded_inputs)[0]
    return model(**encoded_inputs).last_hidden_state

In [10]:
sentence1 = "There was a fly drinking from my soup"
sentence2 = "There is a fly swimming in my juice"
sentence3 = "To become a commercial pilot, he had to fly for 1500 hours."

In [11]:
# Tokenizing the sentences
tokens1 = tokenizer.tokenize(sentence1)
tokens2 = tokenizer.tokenize(sentence2)
tokens3 = tokenizer.tokenize(sentence3)

In [12]:
# Getting model predictions(last_hidden_state) for the sentences
out1 = predict(sentence1)
out2 = predict(sentence2)
out3 = predict(sentence3)

In [13]:
print(out1.shape)
print(out2.shape)
print(out3.shape)

torch.Size([1, 10, 768])
torch.Size([1, 10, 768])
torch.Size([1, 16, 768])


In [14]:
# Extracting embeddings for the word 'fly' in all sentences
emb1 = out1[0:, tokens1.index("fly"), :].detach()[0]
emb2 = out2[0:, tokens2.index("fly"), :].detach()[0]
emb3 = out3[0:, tokens3.index("fly"), :].detach()[0]

In [15]:
# print(emb1)
# print(emb2)
# print(emb3)

In [16]:
# Showing difference between the embeddings of same word(fly) in different sentences.
distance12 = cosine(emb1, emb2)
distance13 = cosine(emb1, emb3)
print(distance12)
print(distance13)

0.06798792255140473
0.4047780122944796
