In [1]:
# Toy corpus for the semantic-search demo: ten one-sentence documents.
documents = [
    "Quantum mechanics describes the behavior of very small particles.",
    "Photosynthesis is the process by which green plants make food using sunlight.",
    "Shakespeare's plays are a testament to English literature.",
    "Artificial Intelligence aims to create machines that can think and learn.",
    "The pyramids of Egypt are historical monuments that have stood for thousands of years.",
    "Biology is the study of living organisms and their interactions with the environment.",
    "Music therapy can aid in the mental well-being of individuals.",
    "The Milky Way is just one of billions of galaxies in the universe.",
    "Economic theories help understand the distribution of resources in society.",
    "Yoga is an ancient practice that involves physical postures and meditation.",
]

In [2]:
documents

['Quantum mechanics describes the behavior of very small particles.',
 'Photosynthesis is the process by which green plants make food using sunlight.',
 "Shakespeare's plays are a testament to English literature.",
 'Artificial Intelligence aims to create machines that can think and learn.',
 'The pyramids of Egypt are historical monuments that have stood for thousands of years.',
 'Biology is the study of living organisms and their interactions with the environment.',
 'Music therapy can aid in the mental well-being of individuals.',
 'The Milky Way is just one of billions of galaxies in the universe.',
 'Economic theories help understand the distribution of resources in society.',
 'Yoga is an ancient practice that involves physical postures and meditation.']

# Building a Robust Semantic Search Engine with a Transformer Model

In [3]:
!pip install -U sentence-transformers



In [4]:
!nvidia-smi

Sat Dec 28 17:47:48 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util

In [6]:
# Load the pre-trained sentence-transformers model
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [8]:
len(documents)

10

In [9]:
document_embedding = model.encode(documents)

In [10]:
document_embedding.shape

(10, 384)

In [11]:
# Let's try to find the most similar document for one query

new_query = 'what is AI?'
new_query

'what is AI?'

In [12]:
# Get the embedding for the new query
next_text_embedding = model.encode(new_query)
next_text_embedding

array([-2.49644294e-02, -9.13366675e-03, -7.46155018e-03,  1.50090652e-02,
        1.33104008e-02, -1.00360438e-02,  7.45601356e-02,  4.26714569e-02,
        1.69885065e-02,  5.59509546e-02, -2.96819080e-02, -4.35607787e-03,
        2.05323361e-02, -4.82827090e-02, -5.86694703e-02,  4.23620231e-02,
       -1.89194735e-02, -5.29925190e-02, -8.71717334e-02, -6.99818581e-02,
       -8.72021820e-03,  1.96782555e-02, -4.86003831e-02, -4.87065464e-02,
       -3.24669331e-02,  9.29557160e-02,  4.01412556e-03, -6.72163144e-02,
       -2.18765275e-03, -1.11629646e-02,  1.21077262e-02, -2.36866456e-02,
        1.02300443e-01,  1.91985052e-02, -6.65093437e-02,  4.12883870e-02,
       -4.09804024e-02, -2.75316685e-02,  6.85106069e-02, -2.96445526e-02,
       -1.92079283e-02, -5.42026609e-02,  1.68526936e-02, -7.19662681e-02,
        1.08496055e-01,  1.24557033e-01, -7.32609704e-02, -1.34330308e-02,
        3.10076419e-02,  5.09724393e-02, -1.31916091e-01, -1.21613510e-03,
       -5.76595543e-03,  

In [14]:
# Get cosine similarity score of document embedding compared to new query embedding
cos_scores = util.pytorch_cos_sim(next_text_embedding, document_embedding)[0]
cos_scores

tensor([0.0899, 0.0313, 0.0684, 0.6847, 0.0199, 0.1553, 0.0797, 0.0424, 0.1250,
        0.1547])

In [15]:
# Analogy with attention: score = Query · Key
#   query = next_text_embedding
#   key   = document_embedding
#   score = cosine similarity between query and key (here cos_scores)
# The "value" in this analogy is the document text retrieved by the best-scoring key.

In [16]:
# cos_scores is a torch tensor, so use torch's own argmax rather than routing
# the tensor through NumPy; the result is the index of the best-matching document.
torch.argmax(cos_scores)

tensor(3)

In [17]:
top_result = torch.topk(cos_scores, k=1)
top_result

torch.return_types.topk(
values=tensor([0.6847]),
indices=tensor([3]))

In [20]:
torch.topk(cos_scores, k=3)

torch.return_types.topk(
values=tensor([0.6847, 0.1553, 0.1547]),
indices=tensor([3, 5, 9]))

In [18]:
idx = top_result.indices[0]
idx

tensor(3)

In [19]:
# Get most similar document
documents[idx]

'Artificial Intelligence aims to create machines that can think and learn.'

In [21]:
# Create a function to return the top similar document based on any query

def semantic_search_engine(query, embedder_model, corpus=None, corpus_embeddings=None):
  """Return the document most similar to ``query``.

  Args:
    query: Text to search for.
    embedder_model: Object whose ``encode`` method turns text into an
      embedding (here, the model that was used to embed the corpus).
    corpus: Optional sequence of documents to search. Defaults to the
      notebook-level ``documents`` list.
    corpus_embeddings: Optional embeddings of ``corpus``, one per document and
      in the same order. Defaults to the notebook-level ``document_embedding``
      when ``corpus`` is not given; when only ``corpus`` is given, the corpus
      is encoded with ``embedder_model``.

  Returns:
    The element of ``corpus`` whose embedding has the highest cosine
    similarity to the embedding of ``query``.

  Raises:
    ValueError: If ``corpus`` and ``corpus_embeddings`` differ in length.
  """
  if corpus is None:
    # Backward-compatible default: search the notebook-level corpus.
    corpus = documents
    if corpus_embeddings is None:
      corpus_embeddings = document_embedding
  elif corpus_embeddings is None:
    corpus_embeddings = embedder_model.encode(corpus)

  # Guard against stale state, e.g. the corpus was edited but not re-encoded.
  if len(corpus) != len(corpus_embeddings):
    raise ValueError(
      f"corpus has {len(corpus)} documents but corpus_embeddings has "
      f"{len(corpus_embeddings)} rows"
    )

  query_embedding = embedder_model.encode(query)
  # Row 0 holds the similarity of the single query against every document.
  cos_score = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
  top_result = torch.topk(cos_score, k=1)
  # Convert the 0-dim index tensor to a plain int so any sequence can be indexed.
  best_idx = int(top_result.indices[0])
  return corpus[best_idx]

In [22]:
# Try out the function

new_sentence = 'Tell me about AI'
sentence_2 = 'Do you know about the pyramids'
sentence_3 = 'How do plants survive?'
sentence_4 = 'What about english literature?'

In [24]:
semantic_search_engine(new_sentence, model)

'Artificial Intelligence aims to create machines that can think and learn.'

In [25]:
semantic_search_engine(sentence_2, model)

'The pyramids of Egypt are historical monuments that have stood for thousands of years.'

In [26]:
semantic_search_engine(sentence_3, model)

'Photosynthesis is the process by which green plants make food using sunlight.'

In [27]:
semantic_search_engine(sentence_4, model)

"Shakespeare's plays are a testament to English literature."