In [1]:
import random 
import torch 
from transformers import BertTokenizer, BertModel 
from sklearn.metrics.pairwise import cosine_similarity

seed = 42 
random.seed(seed)
torch.manual_seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f667c045df0>

In [2]:
if torch.cuda.is_available(): 
    torch.cuda.manual_seed_all(seed)
    

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [4]:
text = "The AIMT is a fantastic program at the lambton college."

In [5]:
encoding = tokenizer(text, return_tensors='pt', padding= True, truncation=False, max_length=64)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]



In [6]:
with torch.no_grad():

# model.eval()

    outputs = model(input_ids, attention_mask=attention_mask)
    word_embedding = outputs.last_hidden_state

In [7]:
print(f"Shape of word embeddings: {word_embedding.shape}")

Shape of word embeddings: torch.Size([1, 15, 768])


In [8]:
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

# the decoded text 
print(f"Decoded Text : {decoded_text}")

tokenized_text = tokenizer.tokenize(decoded_text)

print(f"tokenized text {tokenized_text}")

encoded_text = tokenizer.encode(text, return_tensors="pt")
print(encoded_text)

Decoded Text : the aimt is a fantastic program at the lambton college.
tokenized text ['the', 'aim', '##t', 'is', 'a', 'fantastic', 'program', 'at', 'the', 'lamb', '##ton', 'college', '.']
tensor([[  101,  1996,  6614,  2102,  2003,  1037, 10392,  2565,  2012,  1996,
         12559,  2669,  2267,  1012,   102]])


In [9]:
for token, embedding in zip(tokenized_text, word_embedding[0]):
    print(f"Token: {token}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"\n")

Token: the
Embedding shape: torch.Size([768])


Token: aim
Embedding shape: torch.Size([768])


Token: ##t
Embedding shape: torch.Size([768])


Token: is
Embedding shape: torch.Size([768])


Token: a
Embedding shape: torch.Size([768])


Token: fantastic
Embedding shape: torch.Size([768])


Token: program
Embedding shape: torch.Size([768])


Token: at
Embedding shape: torch.Size([768])


Token: the
Embedding shape: torch.Size([768])


Token: lamb
Embedding shape: torch.Size([768])


Token: ##ton
Embedding shape: torch.Size([768])


Token: college
Embedding shape: torch.Size([768])


Token: .
Embedding shape: torch.Size([768])




In [10]:
sentence_embedding = word_embedding.mean(dim=1)

print("Sentence Embedding: ")
print(sentence_embedding)

print(f"Shape of Sentence Embedding: {sentence_embedding.shape}")

Sentence Embedding: 
tensor([[ 1.4300e-01,  9.2324e-02,  1.6427e-01,  1.3220e-01,  3.3594e-01,
         -1.5824e-01,  2.7828e-01,  6.2212e-01, -2.4755e-01, -3.1975e-01,
          2.3491e-01, -1.3382e-01,  4.8492e-01,  4.8614e-01,  8.1320e-02,
         -1.4418e-01,  2.7171e-01, -4.3801e-03, -9.3331e-02, -1.1371e-01,
         -1.9955e-01, -9.0681e-02, -1.7671e-01,  5.2152e-01,  4.3704e-01,
          1.5830e-01,  2.9250e-01,  4.7411e-02, -2.0701e-01,  2.7105e-02,
          1.5560e-01, -7.7934e-03, -2.7743e-01, -2.5712e-01,  2.9287e-02,
          5.7685e-03, -3.7565e-01, -3.6692e-01, -2.2923e-01, -8.4029e-03,
         -4.7351e-01, -4.7651e-01, -9.4768e-02,  5.1951e-02, -1.7419e-01,
         -2.9894e-01, -2.7191e-01, -1.5192e-01,  1.3657e-01, -1.7907e-01,
         -2.1145e-01,  2.3613e-01, -5.8511e-02, -2.5221e-01,  7.3377e-02,
          4.8811e-01, -3.8451e-01, -2.4617e-01, -2.4480e-01, -7.6153e-02,
         -1.4820e-01,  9.9156e-02, -3.0508e-01, -3.8623e-02,  3.4082e-02,
          1.2756e

In [11]:
example_sentence = "The lambton college is a great place and AIMT program open doors to future oppturnities."

# tokenize and encode 
example_encoding = tokenizer(example_sentence, return_tensors="pt", padding=True, truncation=True)
example_input_ids = example_encoding["input_ids"]
example_attention_masks = example_encoding["attention_mask"]

In [12]:
with torch.no_grad():
    example_outputs = model(example_input_ids,attention_mask= example_attention_masks)
    example_sentence_embedding = example_outputs.last_hidden_state.mean(dim=1)
    
similarity = cosine_similarity(sentence_embedding, example_sentence_embedding)

In [13]:
print(f"the similarity was : {similarity}")

the similarity was : [[0.8690177]]
