## What are embeddings?

Embeddings are a way to convert words into numbers (vectors), in a form the computer can work with.

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [6]:
def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Computed as dot(vec1, vec2) / (||vec1|| * ||vec2||).

    Args:
        vec1: First vector (any 1-D sequence of numbers accepted by np.dot).
        vec2: Second vector, same length as vec1.

    Returns:
        The similarity score: -1 means opposite direction, 0 means not
        related (orthogonal), 1 means same direction. If either vector has
        zero magnitude the cosine is undefined and 0.0 is returned.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero vector has no direction; dividing here would yield nan
        # (plus a RuntimeWarning), which poisons later sorting/comparison.
        return np.float64(0.0)
    return dot_product / norm_product

# Creating embeddings

In [3]:
## Load a Hugging Face sentence-embedding model through LangChain

from langchain_huggingface import HuggingFaceEmbeddings

# `embeddings` is the model wrapper reused by the cells below
# (embed_query for a single text, embed_documents for a list of texts).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Bare last expression so the notebook displays the configured object.
embeddings

  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [4]:
## The model maps a sentence or short paragraph to a 384-dimensional dense vector,
## which can be used for tasks like clustering or semantic search.
text = "Hello, I am learning about embeddings."

# embed_query embeds one string and returns its vector.
vector = embeddings.embed_query(text)

print(f"Text: {text}")
print(f"Embedding Length: {len(vector)}")
# Prints the entire vector, so the output below is long.
print(vector)

Text: Hello, I am learning about embeddings.
Embedding Length: 384
[-0.023014338687062263, -0.10264783352613449, -0.00010355715494370088, 0.003793434938415885, 0.015323277562856674, 0.04926801100373268, -0.012406931258738041, 0.013899898156523705, 0.038785334676504135, -0.023891078308224678, 0.026761580258607864, 0.08163347095251083, 0.04130309075117111, 0.022510934621095657, -0.02644752897322178, 0.016060253605246544, 0.04206305369734764, 0.07388664782047272, -0.06676710397005081, -0.0010623931884765625, -0.02625349909067154, -0.042378295212984085, 0.0290607251226902, -0.09618093818426132, 0.037492040544748306, -0.030485300347208977, -0.031298454850912094, 0.05315070599317551, 0.09716412425041199, -0.07011305540800095, 0.022248143330216408, -0.0231437049806118, -0.02078201249241829, 0.06618605554103851, -0.05602779984474182, 0.1268676519393921, 0.03423291817307472, -0.0031785815954208374, -0.07056888192892075, -0.007944576442241669, 0.03417993709445, 0.03974391520023346, -0.0277243852

In [8]:
# Five test sentences: two about a cat/feline resting, one about a dog,
# and two about programming in Python.
sentences = ["A cat sat on the mat.", "A feline rested on the rug", "The dog played in the yard.", "I love programming in Python", "Python is my favorite programming language."]

# embed_documents embeds the whole list at once: one vector per sentence.
embedding_sentence = embeddings.embed_documents(sentences)
# Number of vectors, then the length of the first vector.
print(len(embedding_sentence), len(embedding_sentence[0]))

# Full vectors for the first two sentences (the cat / feline pair).
print(embedding_sentence[0])
print(embedding_sentence[1])

5 384
[0.12081317603588104, -0.02353622205555439, -0.036689478904008865, 0.05646262317895889, -0.06598735600709915, 0.028884282335639, 0.04533516615629196, 0.0245351679623127, -0.021596234291791916, 0.06051689386367798, -0.030385838821530342, 0.0679214745759964, 0.04265797883272171, 0.04072149470448494, -0.05587439984083176, -0.04850936681032181, -0.04652617126703262, -0.0386337973177433, 0.061583682894706726, 0.04188208654522896, -0.039417851716279984, 0.003178470069542527, 0.028615936636924744, -0.06368482857942581, -0.0484912171959877, 0.06777587532997131, -0.029583968222141266, -0.0541514977812767, 0.03716428577899933, -0.005214336793869734, -0.07258733361959457, -0.0008463573176413774, -0.03815697506070137, 0.05930107831954956, 0.025240201503038406, -0.09025305509567261, 0.018273409456014633, -0.0672234296798706, 0.04536253213882446, 0.022970039397478104, 0.08785846084356308, -0.008037894032895565, -0.006742424331605434, -0.06397843360900879, 0.024190060794353485, 0.01639444194734

# Computing cosine similarity between each pair of sentences

In [9]:
## Print the cosine similarity for every unique pair of sentences

for i in range(len(sentences)):
    # j starts at i + 1, so each pair is scored once and no sentence
    # is compared with itself.
    for j in range(i + 1, len(sentences)):
        sim = cosine_similarity(embedding_sentence[i], embedding_sentence[j])
        print(f"Cosine Similarity between '{sentences[i]}' and '{sentences[j]}': {sim:.4f}")

Cosine Similarity between 'A cat sat on the mat.' and 'A feline rested on the rug': 0.5547
Cosine Similarity between 'A cat sat on the mat.' and 'The dog played in the yard.': 0.1955
Cosine Similarity between 'A cat sat on the mat.' and 'I love programming in Python': 0.0261
Cosine Similarity between 'A cat sat on the mat.' and 'Python is my favorite programming language.': 0.0098
Cosine Similarity between 'A feline rested on the rug' and 'The dog played in the yard.': 0.2730
Cosine Similarity between 'A feline rested on the rug' and 'I love programming in Python': 0.0652
Cosine Similarity between 'A feline rested on the rug' and 'Python is my favorite programming language.': 0.0448
Cosine Similarity between 'The dog played in the yard.' and 'I love programming in Python': 0.1223
Cosine Similarity between 'The dog played in the yard.' and 'Python is my favorite programming language.': 0.0946
Cosine Similarity between 'I love programming in Python' and 'Python is my favorite programming

In [17]:
### Retrieve the most similar sentence
# Test data for semantic search: a small corpus and a query to match against it.
documents = [
    "Langchain is a framework for developing applications powered by language models.",
    "Python is a high-level programming language.",
    "Machine learning is a subset of artificial intelligence.",
    "Embeddings convert text into numerical representations.",
    "The weather today is sunny with a chance of rain."
]
query = "What is Embeddings?"

# Earlier inline draft of the scoring step, kept for reference only;
# semantic_search() embeds the query and documents and scores them the same way.
# query_embedding = embeddings.embed_query(query)
# similarities = [cosine_similarity(query_embedding, doc_embedding) for doc_embedding in embeddings.embed_documents(documents)]

In [14]:
def semantic_search(query, documents, embeddings, top_k=2):
    """Return the documents most similar to a query, best match first.

    Args:
        query: Text to search for.
        documents: Sequence of candidate texts.
        embeddings: Object providing embed_query(text) and
            embed_documents(texts), e.g. the HuggingFaceEmbeddings instance
            created earlier in this notebook.
        top_k: Maximum number of results to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of (similarity, document) tuples sorted by descending cosine
        similarity, at most top_k long. Documents with equal scores keep
        their original relative order.
    """
    # Nothing to rank: skip the model calls entirely. This also stops a
    # negative top_k from being interpreted as "all but the last k" by slicing.
    if top_k <= 0 or len(documents) == 0:
        return []

    query_embedding = embeddings.embed_query(query)
    document_embeddings = embeddings.embed_documents(documents)

    scored_documents = [
        (cosine_similarity(query_embedding, doc_embedding), document)
        for document, doc_embedding in zip(documents, document_embeddings)
    ]

    # Sort on the score only; the sort is stable, so ties are never broken by
    # comparing the second tuple element.
    scored_documents.sort(key=lambda pair: pair[0], reverse=True)
    return scored_documents[:top_k]

In [18]:
# Retrieve the 3 documents most similar to the query; the cell displays the
# returned list of (similarity, document) tuples.
semantic_search(query, documents, embeddings, 3)

[(np.float64(0.5993902903350031),
  'Embeddings convert text into numerical representations.'),
 (np.float64(0.2304667574556656),
  'Machine learning is a subset of artificial intelligence.'),
 (np.float64(0.18857268248667056),
  'Langchain is a framework for developing applications powered by language models.')]