<a href="https://colab.research.google.com/github/Victor-Mutuku/Semantic-Similarity-with-BERT/blob/main/PolysemyProbe_Dynamic_BERT_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required Libraries
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load pre-trained BERT tokenizer and BERT model from Hugging Face.
# Files are downloaded on first use and then reused from the local Hugging Face
# cache, so re-running this cell does not fetch the weights again. Pass
# force_download=True only when a cached file must be replaced.
# use_safetensors=False is kept from the original call (the TensorFlow weights
# file tf_model.h5 is what gets downloaded).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_safetensors=False)  # Converts sentences into tokens and numerical IDs that BERT understands (input preparation)
bert_model = TFBertModel.from_pretrained('bert-base-uncased', use_safetensors=False)  # Pre-trained transformer language model that produces contextual token embeddings

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

In [None]:
# Example sentence pairs (some similar, some not).
# Each entry is a (first sentence, second sentence) tuple; order matters because
# the ground-truth labels are matched to the pairs by position.
sentence_pairs = [
    (
        "How do I learn Python?",
        "What is the best way to study Python?",
    ),
    (
        "What is AI?",
        "How to cook pasta?",
    ),
    (
        "How do I bake a chocolate cake?",
        "Give me a chocolate cake recipe.",
    ),
    (
        "How can I improve my coding skills?",
        "Tips for becoming better at programming.",
    ),
    (
        "Where can I buy cheap laptops?",
        "Best sites to find affordable computers.",
    ),
    (
        "Where can I start my career?",
        "Here under Githinji regime with impunity everywhere?",
    ),
    (
        "Who is the Goat of football?",
        "It was not a debate anymore,Messi is the Goat",
    ),
    (
        "Where can I get a manual customised Rs6?",
        "Scoring distinction in Data & AI track is the aim",
    ),
    (
        "How to be fluent in english",
        "I love doing workout to keep fit",
    ),
    (
        "Whats the aim of living",
        "Whats does the solar system consist of?",
    ),
]

In [None]:
# Ground truth similarity labels: 1 = similar, 0 = not similar.
# Entries are positional: labels[i] is the label for sentence_pairs[i].
labels = [
    1,  # pair 0: learn Python / study Python
    0,  # pair 1: AI / cooking pasta
    1,  # pair 2: bake a chocolate cake / chocolate cake recipe
    1,  # pair 3: improve coding skills / better at programming
    1,  # pair 4: cheap laptops / affordable computers
    1,  # pair 5: start my career / Githinji regime -- NOTE(review): confirm this pair is meant to be "similar"
    1,  # pair 6: Goat of football / Messi is the Goat
    0,  # pair 7: customised Rs6 / Data & AI track
    0,  # pair 8: fluent in english / workout
    0,  # pair 9: aim of living / solar system
]

In [None]:
# Function to get a fixed-size BERT embedding for a sentence (mean-pooled by default)
def get_sentence_embedding(sentence, pooling="mean"):
  """Embed one sentence with the notebook-level `tokenizer` and `bert_model`.

  Args:
    sentence: Text to embed. It is tokenized with special tokens added and
      truncation enabled.
    pooling: How the per-token vectors of the final layer are reduced to one
      vector. "mean" (default, the behaviour this notebook uses) averages over
      the token axis; "cls" returns the vector at token position 0, i.e. the
      [CLS] token that the tokenizer prepends.

  Returns:
    A NumPy array holding the pooled embedding with the batch axis first
    (one row for a single input sentence).

  Raises:
    ValueError: If `pooling` is neither "mean" nor "cls".
  """
  # Reject unknown modes before paying for a forward pass through BERT.
  if pooling not in ("mean", "cls"):
    raise ValueError(f"pooling must be 'mean' or 'cls', got {pooling!r}")

  # Tokenize and encode the sentence into TensorFlow input tensors
  inputs = tokenizer(sentence, return_tensors="tf", add_special_tokens=True, truncation=True)

  # Run the tokenized inputs through BERT
  output = bert_model(inputs)

  # Final-layer embedding for every token: [batch, tokens, hidden]
  token_embeddings = output.last_hidden_state

  if pooling == "cls":
    # Keep only position 0 of the token axis (the [CLS] token)
    pooled_embedding = token_embeddings[:, 0, :]
  else:
    # Average across the token axis; the special tokens are included in the
    # average, exactly as in the original mean-pooling code.
    pooled_embedding = tf.reduce_mean(token_embeddings, axis=1)

  return pooled_embedding.numpy()

In [None]:
# Calculate cosine similarity for each pair and turn it into a 0/1 prediction
SIMILARITY_THRESHOLD = 0.7  # a pair is predicted "similar" only when its score is strictly above this

predictions = []  # predicted label (0/1) per pair, in sentence_pairs order
sim_scores = []   # raw cosine similarity per pair, in the same order
for sent1, sent2 in sentence_pairs:
  # Get embeddings for both sentences of the pair
  embedding1 = get_sentence_embedding(sent1)
  embedding2 = get_sentence_embedding(sent2)

  # cosine_similarity returns a 2-D result; each input here is a single row,
  # so the one score of interest sits at [0][0].
  sim_score = cosine_similarity(embedding1, embedding2)[0][0]
  sim_scores.append(sim_score)  # keep the raw score so it can be inspected later

  pred = 1 if sim_score > SIMILARITY_THRESHOLD else 0
  predictions.append(pred)  # stores the predicted label (0/1) for the accuracy check

  print(f"\n sentence 1:{sent1}")
  print(f"sentence 2:{sent2}")
  print(f"Similarity Score:{sim_score}")
  print(f"Cosine Similarity: {sim_score:.4f} --> Predicted Similarity: {pred}")

In [None]:
# Evaluate accuracy: count the positions where prediction and label agree.
# The index range comes from len(labels), so only the first len(labels)
# predictions are compared.
correct = sum(1 for idx in range(len(labels)) if predictions[idx] == labels[idx])

In [None]:
# Final accuracy calculation: matched predictions divided by the number of labels
total = len(labels)  # number of ground-truth labels that were compared
accuracy = correct / total  # fraction of labels the predictions matched
print(f"Accuracy: {accuracy:.2%}")  # shown as a percentage with 2 decimal places