### Cosine Similarity With OpenAI Embeddings

In [1]:
# Example 1: Finding similar sentences
# Sample corpus for the pairwise comparison: two sentences about a cat, one
# about a dog, and two about programming in Python.
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language"
]

In [3]:
import numpy as np

def cosine_similarity(vector1, vector2):
    """
    Return the cosine of the angle between two vectors.

    Interpreting the result:
    - Result close to 1: Very similar (vectors point the same way)
    - Result close to 0: Not related (vectors are orthogonal)
    - Result close to -1: Opposite meanings (vectors point opposite ways)

    Args:
        vector1: Sequence or array of numbers.
        vector2: Sequence or array of numbers with the same length as vector1.

    Returns:
        The dot product divided by the product of the two norms. If either
        vector has zero magnitude the angle is undefined, and 0.0 is returned
        rather than NaN.

    Raises:
        ValueError: Propagated from np.dot when the shapes are incompatible.
    """
    # Convert once so list inputs are not re-parsed by each NumPy call below.
    vec_a = np.asarray(vector1)
    vec_b = np.asarray(vector2)

    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # A zero-length vector would make this a 0/0 division (NaN + RuntimeWarning).
    if norm_product == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product

In [4]:
from langchain_openai import OpenAIEmbeddings

# Create the embedding client for the 'text-embedding-3-small' model.
# NOTE(review): no API key is passed here — presumably it is picked up from
# the environment (e.g. OPENAI_API_KEY); confirm before running on a fresh kernel.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
# Echo the client so its configuration is shown as the cell output.
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000023D61781010>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000023D61724BC0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [5]:
# Embed all sentences in a single call; the result is a list holding one
# vector (a list of floats) per input sentence, in the same order.
sentence_embeddings = embeddings.embed_documents(sentences)
# NOTE(review): echoing the variable prints every float of every vector;
# consider displaying only the dimensions, e.g.
# (len(sentence_embeddings), len(sentence_embeddings[0])).
sentence_embeddings

[[-0.030758168548345566,
  -0.04960077628493309,
  -0.005053127650171518,
  -0.0015193307772278786,
  0.03628946840763092,
  -0.002040313323959708,
  -0.008891437202692032,
  0.027165407314896584,
  0.007082132622599602,
  -0.011896173469722271,
  0.04161399230360985,
  -0.0013739402638748288,
  0.04523260146379471,
  0.052702441811561584,
  0.03202468156814575,
  0.03248992934823036,
  -0.012426041066646576,
  0.0031016638968139887,
  -0.06601374596357346,
  0.04748130589723587,
  0.025821352377533913,
  -0.04538768157362938,
  -0.0034893720876425505,
  0.014590744860470295,
  0.00907236710190773,
  0.014771674759685993,
  -0.011172452010214329,
  -0.012070642784237862,
  0.010778282769024372,
  0.01282667275518179,
  0.01228388212621212,
  -0.036056842654943466,
  -0.026519227772951126,
  -0.04536183550953865,
  -0.03458355367183685,
  0.004807579331099987,
  -0.01986357383430004,
  -0.011728166602551937,
  -0.042027547955513,
  -0.02282307855784893,
  -0.036341164261102676,
  -0.005

In [7]:
# Calculate the similarity between all pairs of sentences.
# j always starts one past i, so each unordered pair is scored exactly once
# and no sentence is compared with itself.
total = len(sentences)
for i in range(total):
    vector_i = sentence_embeddings[i]
    for j in range(i + 1, total):
        similarity = cosine_similarity(vector_i, sentence_embeddings[j])

        print(f"'{sentences[i]}' vs '{sentences[j]}'")
        print(f"Similarity: {similarity:.3f}\n")

'The cat sat on the mat' vs 'A feline rested on the rug'
Similarity: 0.656

'The cat sat on the mat' vs 'The dog played in the yard'
Similarity: 0.324

'The cat sat on the mat' vs 'I love programming in Python'
Similarity: 0.090

'The cat sat on the mat' vs 'Python is my favorite programming language'
Similarity: 0.120

'A feline rested on the rug' vs 'The dog played in the yard'
Similarity: 0.296

'A feline rested on the rug' vs 'I love programming in Python'
Similarity: 0.055

'A feline rested on the rug' vs 'Python is my favorite programming language'
Similarity: 0.103

'The dog played in the yard' vs 'I love programming in Python'
Similarity: 0.126

'The dog played in the yard' vs 'Python is my favorite programming language'
Similarity: 0.085

'I love programming in Python' vs 'Python is my favorite programming language'
Similarity: 0.707



In [8]:
# Example 2: Semantic search - retrieve the documents most similar to a query
# Small corpus to search over, plus the question to look up.
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm"
]
query = "What is Langchain?"

In [11]:
def semantic_search(query, documents, embeddings_models, top_k=3):
    """Rank documents by cosine similarity to a query and return the best matches.

    Args:
        query: Text to search for; passed to ``embeddings_models.embed_query``.
        documents: Sized sequence of documents to rank; passed unchanged to
            ``embeddings_models.embed_documents``.
        embeddings_models: Object exposing ``embed_query(text)`` and
            ``embed_documents(texts)`` that return embedding vectors.
        top_k: Maximum number of results to return (default 3). ``None``
            returns every document; zero or a negative value returns ``[]``.

    Returns:
        A list of ``(similarity, document)`` tuples ordered from highest to
        lowest similarity. Documents with equal scores keep their input order.
    """
    # Nothing requested or nothing to rank: skip the embedding calls entirely.
    # (A negative top_k would otherwise slice off the tail instead of limiting.)
    if (top_k is not None and top_k <= 0) or len(documents) == 0:
        return []

    # Embed the query and the documents
    query_embedding = embeddings_models.embed_query(query)
    doc_embeddings = embeddings_models.embed_documents(documents)

    # Score every document against the query
    similarities = [
        (cosine_similarity(query_embedding, doc_emb), doc)
        for doc, doc_emb in zip(documents, doc_embeddings)
    ]

    # Sort on the score alone: comparing whole tuples would fall back to
    # comparing the documents themselves whenever two scores are equal, which
    # fails for documents that do not support ordering.
    similarities.sort(key=lambda scored: scored[0], reverse=True)
    return similarities[:top_k]

In [12]:
# Rank the sample documents against the query; top_k is left at its default of 3.
results = semantic_search(query, documents, embeddings)
# Echo the raw (similarity, document) tuples as the cell output.
results

[(np.float64(0.6756871264409868),
  'LangChain is a framework for developing applications powered by language models'),
 (np.float64(0.13012682759166014),
  'Python is a high-level programming language'),
 (np.float64(0.10110786905843959),
  'Embeddings convert text into numerical vectors')]

In [13]:
# Report the ranked hits: a header naming the query, then one line per
# (score, document) pair with the score shown to three decimal places.
print(f"\n🔎 Semantic Search Results for: '{query}'")
for score, doc in results:
    print(f"Score: {score:.3f} | {doc}")


🔎 Semantic Search Results for: 'What is Langchain?'
Score: 0.676 | LangChain is a framework for developing applications powered by language models
Score: 0.130 | Python is a high-level programming language
Score: 0.101 | Embeddings convert text into numerical vectors


In [14]:
# Run the same search with a different question. This rebinds the
# notebook-level `query` and `results` names, so cells that read them will
# see these new values from here on.
query="What is Embeddings?"
results=semantic_search(query,documents,embeddings)
# Echo the raw (similarity, document) tuples as the cell output.
results

[(np.float64(0.6227387139613368),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.25206899523723963),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.2291919996321144),
  'LangChain is a framework for developing applications powered by language models')]

In [15]:
# Report the ranked hits for the current `query`: a header line, then one
# line per (score, document) pair with the score shown to three decimals.
print(f"\n🔎 Semantic Search Results for: '{query}'")
for score, doc in results:
    print(f"Score: {score:.3f} | {doc}")


🔎 Semantic Search Results for: 'What is Embeddings?'
Score: 0.623 | Embeddings convert text into numerical vectors
Score: 0.252 | Machine learning is a subset of artificial intelligence
Score: 0.229 | LangChain is a framework for developing applications powered by language models
