# Text embeddings and semantic search
* Text embeddings represent text as vectors
* We can use metrics like cosine similarity to compare how close two embeddings are
* 

In [1]:
#problem
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
# Toy inputs for the similarity demo. The strings are kept verbatim (including
# the leading space in the first one) so recorded outputs stay reproducible.
sentences = [
    " I took my dog for a walk",
    "Today is goint to rain",
    "I took my cat for a walk",
]

In [3]:
# Identifier of the pretrained checkpoint; both the tokenizer and the model
# are loaded from this same id.
model_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)

In [4]:
# Tokenize all sentences as one batch and return PyTorch tensors ("pt").
# Padding and truncation are enabled so the rows can share one tensor.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [5]:
# Forward pass for inference only: gradient tracking is switched off.
with torch.no_grad():
    model_output = model(**encoded_input)

In [6]:
# last_hidden_state holds one vector per token, not one per sentence
# (see the shape printed below).
token_embeddings = model_output.last_hidden_state
print(f"Token embeddings shape:{token_embeddings.size()}")

Token embeddings shape:torch.Size([3, 9, 384])


In [7]:
#Token embeddings shape:torch.Size([num_sentences, num_tokens, embedding_dim])

# Above we can see that each token is represented by one vector but we want one vector per sentence

In [8]:
# To address this, we use mean pooling to create the sentence vectors

In [9]:
import torch
import torch.nn.functional as F

In [10]:
def mean_polling(model_output,attention_mask):
    """Mean-pool token embeddings into one vector per sentence.

    Padding positions are excluded from the average by weighting every token
    vector with its attention-mask value. The function keeps its original
    name because other cells call it as ``mean_polling``.

    Args:
        model_output: Object exposing ``last_hidden_state``, a tensor of
            token embeddings shaped
            (num_sentences, num_tokens, embedding_dim).
        attention_mask: Tensor shaped (num_sentences, num_tokens) holding 1
            for real tokens and 0 for padding.

    Returns:
        Tensor shaped (num_sentences, embedding_dim) with the masked mean of
        the token embeddings for each sentence.
    """
    token_embeddings = model_output.last_hidden_state
    # Stretch the mask over the embedding dimension so it lines up
    # element-wise with the token embeddings.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    # Multiply (not add): this zeroes out padded positions so they do not
    # contribute to the sum.
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp avoids a division by zero when a row has no real tokens.
    token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / token_counts

In [11]:
# Collapse the per-token vectors into one vector per sentence, using the
# attention mask produced by the tokenizer.
attention_mask = encoded_input["attention_mask"]
pooled = mean_polling(model_output, attention_mask)
# L2-normalise each row so every sentence vector has unit length.
sentence_embeddings = F.normalize(pooled, p=2, dim=1)
print(f"sentence embedding shape :{sentence_embeddings.size()}")

sentence embedding shape :torch.Size([3, 384])


In [12]:
# Above we can see that it generates one embedding per sentence:
# here 3 sentences, each with an embedding dimension of 384

# Cosine similarity

In [13]:
#once we have our embedding we can calculate the cosine similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Convert the tensor to a NumPy array for the scikit-learn call below.
sentence_embeddings = sentence_embeddings.detach().numpy()

In [15]:
# Square matrix with one row and one column per sentence, filled in below.
num_sentences = sentence_embeddings.shape[0]
scores = np.zeros((num_sentences, num_sentences))

In [16]:
# Fill row i with the cosine similarity between sentence i and every sentence.
for row, embedding in enumerate(sentence_embeddings):
    scores[row, :] = cosine_similarity([embedding], sentence_embeddings)[0]

# Now we can use the same trick to measure the similarity of questions against a corpus of docs

In [17]:
from datasets import load_dataset

In [18]:
# Load only the first 100 examples of the SQuAD validation split.
squad = load_dataset("squad",split="validation[:100]")

In [19]:
def get_embeddings(text_list):
    """Embed text with the notebook's tokenizer, model and pooling function.

    Args:
        text_list: Text to embed; passed straight to the tokenizer.

    Returns:
        The tensor returned by ``mean_polling`` for the tokenized batch, one
        row per input sequence.
    """
    batch = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # Move every tokenizer output tensor to the CPU before the forward pass.
    batch = {name: tensor.to("cpu") for name, tensor in batch.items()}
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(**batch)
        pooled = mean_polling(outputs, batch["attention_mask"])
        return pooled
def _embed_context(example):
    """Return the embedding of one example's context as a 1-D NumPy array."""
    vector = get_embeddings(example["context"]).cpu().numpy()[0]
    # NOTE: the column name keeps its trailing space because later cells
    # look the column up as "embeddings ".
    return {"embeddings ": vector}


squd_with_embeddings = squad.map(_embed_context)

In [20]:
squd_with_embeddings

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'embeddings '],
    num_rows: 100
})

In [21]:
!pip install faiss-cpu





In [27]:
# Build a FAISS index over the embeddings column for fast nearest-neighbour
# lookup. The column name includes a trailing space, matching how it was created.
squd_with_embeddings.add_faiss_index(column="embeddings ")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'embeddings '],
    num_rows: 100
})

In [23]:
question = "Who headlined the halftime show for Super Bowl 50?"
# Embed the query as a one-item batch and convert it to a NumPy array.
question_tensor = get_embeddings([question])
question_embeddings = question_tensor.cpu().detach().numpy()
question_embeddings.shape

(1, 384)

In [24]:
# Retrieve the 5 rows whose stored embeddings are nearest to the question.
# NOTE: this rebinds `scores`, which earlier held the pairwise similarity matrix.
# NOTE(review): presumably these scores are distances (lower = closer) —
# confirm against the FAISS index type in use.
scores,samples = squd_with_embeddings.get_nearest_examples(
    "embeddings ",question_embeddings,k=5)

In [25]:
# NOTE(review): all five scores in the recorded output are identical and the
# retrieved ids are consecutive with the same title; this looks like several
# questions sharing one context passage rather than distinct matches — confirm.
scores

array([23.663605, 23.663605, 23.663605, 23.663605, 23.663605],
      dtype=float32)

In [26]:
samples

{'id': ['56be5333acb8001400a5030a',
  '56be5333acb8001400a5030b',
  '56be5333acb8001400a5030c',
  '56be5333acb8001400a5030d',
  '56be5333acb8001400a5030e'],
 'title': ['Super_Bowl_50',
  'Super_Bowl_50',
  'Super_Bowl_50',
  'Super_Bowl_50',
  'Super_Bowl_50'],
 'context': ['CBS broadcast Super Bowl 50 in the U.S., and charged an average of $5 million for a 30-second commercial during the game. The Super Bowl 50 halftime show was headlined by the British rock group Coldplay with special guest performers Beyoncé and Bruno Mars, who headlined the Super Bowl XLVII and Super Bowl XLVIII halftime shows, respectively. It was the third-most watched U.S. broadcast ever.',
  'CBS broadcast Super Bowl 50 in the U.S., and charged an average of $5 million for a 30-second commercial during the game. The Super Bowl 50 halftime show was headlined by the British rock group Coldplay with special guest performers Beyoncé and Bruno Mars, who headlined the Super Bowl XLVII and Super Bowl XLVIII halftime s