#### <center> Prajwal Luitel (C0927658)  - Natural Language Processing -</center>

<center>1st August 2024</center>

## Neural Word Embedding

### BERT

In [1]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fix the seed of every random number generator used in this notebook
# (Python's `random` and PyTorch on CPU) so repeated runs start from the same state.
random_seed = 42

torch.manual_seed(random_seed)
random.seed(random_seed)

# CUDA generators are seeded only when a GPU is actually present.
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.manual_seed_all(random_seed)

In [3]:
# Load the tokenizer and the encoder from the same checkpoint name so the
# ids produced by `tokenizer` are the ones `model` was built for.
checkpoint_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertModel.from_pretrained(checkpoint_name)

In [8]:
# Sentence whose word-piece and sentence embeddings are inspected below.
text = "The AIMT is a fantastic program at the Lambton college"

# Tokenize into PyTorch tensors; the result is indexed by 'input_ids'
# and 'attention_mask', which are the two inputs handed to the model later.
encoding = tokenizer(
    text,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']

print(f"Input id: {input_ids}")
print(f"Attention mask: {attention_mask}")

Input id: tensor([[  101,  1996,  6614,  2102,  2003,  1037, 10392,  2565,  2012,  1996,
         12559,  2669,  2267,   102]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
# Run the encoder without tracking gradients: only the forward activations
# are needed here, nothing is trained.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# One vector per input id, taken from the final hidden layer.
word_embeddings = outputs.last_hidden_state

print(f"Shape of word embeddings: {word_embeddings.shape}")


Shape of word embeddings: torch.Size([1, 14, 768])


In [10]:
# Three views of the same sentence:
#   decoded_text   - ids turned back into a string, special tokens dropped
#   tokenized_text - word pieces obtained by tokenizing that decoded string
#   encoded_text   - ids produced directly from the original `text`
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
tokenized_text = tokenizer.tokenize(decoded_text)
encoded_text = tokenizer.encode(text, return_tensors='pt')

print(f"Decoded text: {decoded_text}")
print(f"Tokenized text: {tokenized_text}")
print(f"Encoded text: {encoded_text}")

Decoded text: the aimt is a fantastic program at the lambton college
Tokenized text: ['the', 'aim', '##t', 'is', 'a', 'fantastic', 'program', 'at', 'the', 'lamb', '##ton', 'college']
Encoded text: tensor([[  101,  1996,  6614,  2102,  2003,  1037, 10392,  2565,  2012,  1996,
         12559,  2669,  2267,   102]])


In [13]:
# Pair every hidden state with the token it was actually computed for.
#
# `word_embeddings[0]` holds one row per entry of `input_ids[0]`, and that id
# sequence includes the special tokens the tokenizer wrapped around the
# sentence. `tokenized_text` has no special tokens, so zipping it with the
# embeddings shifts every pairing by one (the first word would receive the
# vector of the leading special token) and silently drops the last rows.
# Deriving the tokens from `input_ids` keeps both sequences the same length.
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
special_tokens = set(tokenizer.all_special_tokens)

for token, embedding in zip(all_tokens, word_embeddings[0]):
    if token in special_tokens:
        # Special tokens are not words of the sentence; skip them in the report.
        continue
    print(f"Token: {token}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"\n")

Token: the
Embedding shape: torch.Size([768])


Token: aim
Embedding shape: torch.Size([768])


Token: ##t
Embedding shape: torch.Size([768])


Token: is
Embedding shape: torch.Size([768])


Token: a
Embedding shape: torch.Size([768])


Token: fantastic
Embedding shape: torch.Size([768])


Token: program
Embedding shape: torch.Size([768])


Token: at
Embedding shape: torch.Size([768])


Token: the
Embedding shape: torch.Size([768])


Token: lamb
Embedding shape: torch.Size([768])


Token: ##ton
Embedding shape: torch.Size([768])


Token: college
Embedding shape: torch.Size([768])




In [14]:
# Collapse the per-token vectors into a single sentence vector by averaging
# along dim=1 of `word_embeddings`; every position contributes equally.
sentence_embedding = torch.mean(word_embeddings, dim=1)

print("Sentence Embedding: ")
print(sentence_embedding)

embedding_shape = sentence_embedding.shape
print(f"Shape of Sentence Embedding: {embedding_shape}")

Sentence Embedding: 
tensor([[-2.0340e-02,  7.2588e-02,  1.7413e-01,  1.0753e-01,  2.9440e-01,
         -1.0248e-01,  2.5406e-01,  6.9635e-01, -3.2303e-01, -3.1278e-01,
          2.3739e-01, -1.4498e-01,  4.1507e-01,  5.6953e-01,  6.8873e-02,
         -1.2940e-01,  3.0408e-01,  3.5122e-02, -2.3468e-01, -1.3765e-01,
         -2.0795e-01, -2.2657e-01, -9.1484e-02,  5.3153e-01,  4.5044e-01,
          2.0673e-01,  3.3365e-01,  7.0274e-02, -1.5105e-01,  1.1173e-01,
          1.2948e-01, -6.0593e-02, -2.1415e-01, -2.6945e-01, -2.0657e-02,
          9.4398e-02, -3.6323e-01, -3.4001e-01, -1.7670e-01,  1.1183e-02,
         -4.2568e-01, -4.5948e-01, -1.4922e-02,  6.0407e-02, -1.2631e-01,
         -3.3636e-01, -1.8430e-01, -1.4505e-01,  1.0150e-01, -2.1398e-01,
         -1.5419e-01,  2.0304e-01, -1.7404e-02, -2.4991e-01,  5.6454e-02,
          4.8624e-01, -3.8084e-01, -2.0731e-01, -2.0960e-01, -9.5725e-02,
         -9.3199e-02,  9.6667e-02, -3.9997e-01,  3.7793e-02, -3.3783e-02,
          9.2909e

In [16]:
# Second sentence to compare against `sentence_embedding` from the earlier cell.
example_sentence = "The Lambton college is a great place and AIMT program opens the door to future opportunities"

# Same pipeline as for `text`: tokenize, run the encoder without gradients,
# then average the final-layer vectors along dim=1.
example_encoding = tokenizer(
    example_sentence,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
example_input_ids, example_attention_mask = (
    example_encoding['input_ids'],
    example_encoding['attention_mask'],
)

with torch.no_grad():
    example_outputs = model(
        input_ids=example_input_ids,
        attention_mask=example_attention_mask,
    )
example_sentence_embedding = torch.mean(example_outputs.last_hidden_state, dim=1)

# cosine_similarity returns a matrix of pairwise scores; with one row on each
# side the single score is read from position [0][0].
similarity_score = cosine_similarity(sentence_embedding, example_sentence_embedding)
print(f"Cosine similarity score: {similarity_score[0][0]}")

Cosine similarity score: 0.8371533155441284
