In [2]:
# Six short sentences about vector embeddings. Later cells embed this list and
# compare the resulting vectors pairwise.
content=[
   "Vector embeddings are numerical representations of data (like text, images, or audio) that capture their meaning in a mathematical form.",
   "They convert complex information into vectors (lists of numbers) so that machines can understand and compare them.",
   "Similar items have embeddings that are closer together in vector space, while different items are farther apart.",
   "In NLP, words or sentences with similar meanings have similar embeddings.",
   "Vector embeddings are widely used in search engines, recommendation systems, clustering, and RAG (Retrieval-Augmented Generation).",
   "They help models find relevance, similarity, and context efficiently."
]


In [3]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    The value is ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``.

    Parameters
    ----------
    vec1, vec2 : sequences of numbers (lists or NumPy arrays) of the same length.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either vector has zero
        magnitude (the cosine is undefined in that case).
    """
    dot_pro = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-magnitude vector has no direction. Without this guard the
    # division produces nan (plus a RuntimeWarning), and nan scores do not
    # order correctly when callers sort results by similarity.
    if norm_product == 0:
        return 0.0
    return dot_pro / norm_product

    


In [4]:
from langchain_openai import OpenAIEmbeddings
# Embedding client shared by the cells below, configured for the
# "text-embedding-3-small" model.
# NOTE(review): no API key is passed here, so it is presumably picked up from
# the environment — confirm before running on a fresh kernel.
embed=OpenAIEmbeddings(
    model="text-embedding-3-small"

)
# Bare expression: the notebook shows the client's repr as the cell output.
embed

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001512F8E9550>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001512F8E9E80>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [5]:
# Embed every sentence in `content` with a single embed_documents call. Judging
# by the recorded output, the result is a list of float lists.
sentence_embed=embed.embed_documents(content)
# NOTE(review): displaying the whole value prints every float of every vector;
# consider showing only the dimensions or a short slice instead.
sentence_embed

[[-0.010003970935940742,
  -0.015957072377204895,
  -0.02026420086622238,
  -0.008397421799600124,
  0.02255082316696644,
  0.006041807122528553,
  0.004994593560695648,
  -0.010605194605886936,
  0.00016755418619140983,
  -0.005544072482734919,
  -0.010408071801066399,
  0.01807614043354988,
  3.2783180358819664e-05,
  -0.06177821755409241,
  0.02304362878203392,
  0.014685632660984993,
  -0.0008488590829074383,
  -0.0131677882745862,
  0.01394642237573862,
  0.007889831438660622,
  -0.0006166240782476962,
  -0.026986081153154373,
  0.007111196871846914,
  0.0036245915107429028,
  0.01962355338037014,
  0.020638734102249146,
  0.013483184389770031,
  0.011709081009030342,
  0.03875429928302765,
  -0.038517750799655914,
  -0.03493012115359306,
  -0.008895155973732471,
  0.009131703525781631,
  -0.02462060935795307,
  0.004085365682840347,
  0.011107857339084148,
  -0.015681101009249687,
  -0.02040218748152256,
  -0.004560923669487238,
  0.021742621436715126,
  0.01803671568632126,
  0.

In [9]:
# Compute the similarity for every unordered pair of sentences.
# j always starts after i, so each pair is visited once and no sentence is
# compared with itself.
index_pairs = [
    (i, j)
    for i in range(len(content))
    for j in range(i + 1, len(content))
]

for i, j in index_pairs:
    similarity = cosine_similarity(sentence_embed[i], sentence_embed[j])
    print(f"{content[i]} and {content[j]}")
    print(f"the similarity {similarity:.3f}\n")


Vector embeddings are numerical representations of data (like text, images, or audio) that capture their meaning in a mathematical form. and They convert complex information into vectors (lists of numbers) so that machines can understand and compare them.
the similarity 0.557

Vector embeddings are numerical representations of data (like text, images, or audio) that capture their meaning in a mathematical form. and Similar items have embeddings that are closer together in vector space, while different items are farther apart.
the similarity 0.460

Vector embeddings are numerical representations of data (like text, images, or audio) that capture their meaning in a mathematical form. and In NLP, words or sentences with similar meanings have similar embeddings.
the similarity 0.438

Vector embeddings are numerical representations of data (like text, images, or audio) that capture their meaning in a mathematical form. and Vector embeddings are widely used in search engines, recommendation 

In [10]:
## Semantic search
# Retrieving the most similar vectors (sentences) for a query.

# Candidate sentences that the search cells below rank against the query.
points=[
   " Machine learning helps computers learn patterns from data.",

   "AI systems can automatically improve their performance using experience.",

   "Deep learning models use neural networks with many layers.",

   "Neural networks are inspired by the way the human brain works.",

   "Data science involves analyzing large amounts of data to extract insights.",

   "Artificial intelligence is widely used in healthcare, finance, and education."
]

In [11]:
# Free-text question that is later passed to semantic_search as the query.
query="what is machine learning"


In [19]:
def semantic_search(query, points, embed, top_k=3):
    """Return the ``top_k`` entries of ``points`` most similar to ``query``.

    Parameters
    ----------
    query : text embedded with ``embed.embed_query``.
    points : sequence of candidate texts, embedded together with one
        ``embed.embed_documents`` call.
    embed : object providing ``embed_query`` and ``embed_documents``.
    top_k : maximum number of results to return (default 3).

    Returns
    -------
    list of ``(similarity, point)`` tuples ordered from most to least
    similar. Points with equal similarity keep their input order. An empty
    list is returned, without calling ``embed``, when ``points`` is empty.
    """
    if len(points) == 0:
        return []

    query_embedding = embed.embed_query(query)
    points_embedding = embed.embed_documents(points)

    # Score each candidate against the query embedding.
    scored = [
        (cosine_similarity(query_embedding, doc_embedding), point)
        for point, doc_embedding in zip(points, points_embedding)
    ]

    # Rank on the score alone. Sorting the whole tuple would fall back to
    # comparing the points themselves on equal scores, which raises
    # TypeError for payloads that do not support ordering.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[:top_k]
    


In [20]:
# Rank `points` against `query`; top_k is left at its default of 3.
results=semantic_search(query,points,embed)
# Bare expression so the notebook displays the (similarity, sentence) tuples.
results

[(np.float64(0.5708541214439551),
  ' Machine learning helps computers learn patterns from data.'),
 (np.float64(0.3609578723062229),
  'Deep learning models use neural networks with many layers.'),
 (np.float64(0.32920597032064663),
  'Artificial intelligence is widely used in healthcare, finance, and education.')]