- A simple RAG example using Hugging Face and OpenAI
- Both providers' embedding models are used to embed the same documents
- The difference in their retrieval output can be seen in the responses below

In [29]:
# Load API credentials from a local .env file and verify they are present
import os
from dotenv import load_dotenv

load_dotenv()

# load_dotenv() already exports whatever it finds in .env, so the previous
# `os.environ[name] = os.getenv(name)` lines changed nothing when a variable
# was set and raised "TypeError: str expected, not NoneType" when it was not.
# Check explicitly instead and fail with a message naming what is missing.
REQUIRED_ENV_VARS = ('HF_TOKEN', 'OPENAI_API_KEY')
missing_env_vars = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing_env_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_env_vars)
        + ". Define them in the shell or in a .env file before running this notebook."
    )



In [30]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

#https://platform.openai.com/docs/guides/embeddings



# Sample corpus as (source, text) pairs. Order matters: each document's id is
# its 1-based position in this table.
SAMPLE_POSTS = [
    ("tweet", "I had chocolate chip pancakes and scrambled eggs for breakfast this morning."),
    ("news", "The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees."),
    ("tweet", "Building an exciting new project with LangChain - come check it out!"),
    ("news", "Robbers broke into the city bank and stole $1 million in cash."),
    ("tweet", "Wow! That was an amazing movie. I can't wait to see it again. "),
    ("website", "Is the new iPhone worth the price? Read this review to find out."),
    ("website", "Lets Watch The KungFu Panda -3"),
    ("tweet", "LangGraph is the best framework for building stateful, agentic applications!"),
    ("news", "The stock market is down 500 points today due to fears of a recession."),
    ("tweet", "I have a bad feeling I am going to get deleted :("),
]

# Build every Document the same way instead of ten copy-pasted constructors.
# Ids are passed as strings, which is the form they take in the recorded
# search outputs (e.g. id='7'), rather than relying on int-to-str coercion.
documents = [
    Document(page_content=text, metadata={"source": source}, id=str(position))
    for position, (source, text) in enumerate(SAMPLE_POSTS, start=1)
]

# Keep the individual names bound for any cell that refers to a single document.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = documents


# Settings for the embedding model loaded through HuggingFaceEmbeddings.
model_name = "all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

# model_kwargs / encode_kwargs used to be defined but never handed to the
# constructor, so the device and normalization settings above had no effect.
hf_embedding_technique = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Second embedding technique, used for the side-by-side comparison.
open_ai_embedding_technique = OpenAIEmbeddings(model="text-embedding-3-small")




In [32]:
# Embed the sample documents and store the vectors in Chroma.
# One store, with its own persist directory, per embedding technique.

from langchain_chroma import Chroma

chroma_vector_stores = Chroma.from_documents(
    documents=documents,
    embedding=hf_embedding_technique,
    persist_directory="./chroma_langchain_db3",
)

chroma_vector_stores_openai = Chroma.from_documents(
    documents=documents,
    embedding=open_ai_embedding_technique,
    persist_directory="./chroma_langchain_db4",
)

In [33]:
# Query the OpenAI-embedded store; the cell output lists the matching documents.
chroma_vector_stores_openai.similarity_search(query="What is the movie name")

[Document(id='7', metadata={'source': 'website'}, page_content='Lets Watch The KungFu Panda -3'),
 Document(id='5', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again. "),
 Document(id='3', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='10', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [20]:
# Same query against the Hugging Face-embedded store, for comparison.
chroma_vector_stores.similarity_search(query="What is the movie name")

[Document(id='5', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again. "),
 Document(id='7', metadata={'source': 'website'}, page_content='Lets Watch The KungFu Panda -3'),
 Document(id='4', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(id='10', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [34]:
# OpenAI-embedded store again, this time returning (document, score) pairs.
# In the recorded output the scores increase down the list, so a smaller
# score means a closer match.
chroma_vector_stores_openai.similarity_search_with_score(query="What is the movie name")

[(Document(id='7', metadata={'source': 'website'}, page_content='Lets Watch The KungFu Panda -3'),
  1.2704536163352678),
 (Document(id='5', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again. "),
  1.3212598199934835),
 (Document(id='3', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
  1.7216422970720335),
 (Document(id='10', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
  1.7736303095923878)]

In [22]:
# Hugging Face-embedded store, returning (document, score) pairs.
# In the recorded output the scores increase down the list, so a smaller
# score means a closer match.
chroma_vector_stores.similarity_search_with_score(query="What is the movie name")

[(Document(id='5', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again. "),
  1.1630622148513794),
 (Document(id='7', metadata={'source': 'website'}, page_content='Lets Watch The KungFu Panda -3'),
  1.4903408288955688),
 (Document(id='4', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
  1.7432069778442383),
 (Document(id='10', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
  1.926144003868103)]

### Retrievers
LangChain VectorStore objects do not subclass Runnable, and so cannot immediately be integrated into LangChain Expression Language chains.

LangChain Retrievers are Runnables, so they implement a standard set of methods (e.g., synchronous and asynchronous invoke and batch operations) and are designed to be incorporated in LCEL chains.

We can create a simple version of this ourselves, without subclassing Retriever. If we choose what method we wish to use to retrieve documents, we can create a runnable easily. Below we will build one around the similarity_search method:

In [24]:
# This is a RunnableLambda
from langchain_core.runnables import RunnableLambda

def add_one(x: int) -> int:
    """Return ``x`` plus one; a toy function for the RunnableLambda demo."""
    incremented = x + 1
    return incremented

# Wrap the plain function in a RunnableLambda.
runnable = RunnableLambda(add_one)

# One input via invoke, then a list of inputs via batch, one result per line.
print(runnable.invoke(1), runnable.batch([1, 2, 3]), sep="\n")


2
[2, 3, 4]


In [27]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    """Return the single closest document from the Hugging Face-embedded store."""
    # NOTE: the name `retriever` is rebound by later cells in this notebook.
    closest = chroma_vector_stores.similarity_search(query=query, k=1)
    return closest


# Run both sample queries in a single batch call.
retriever.batch(["Food", "What is the movie name"])

[[Document(id='1', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')],
 [Document(id='5', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again. ")]]

In [35]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    """Return the single closest document from the OpenAI-embedded store."""
    # NOTE: this rebinds `retriever`, which an earlier cell defined over the
    # Hugging Face-embedded store.
    closest = chroma_vector_stores_openai.similarity_search(query=query, k=1)
    return closest


# Run both sample queries in a single batch call.
retriever.batch(["Food", "What is the movie name"])

[[Document(id='1', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')],
 [Document(id='7', metadata={'source': 'website'}, page_content='Lets Watch The KungFu Panda -3')]]

- Vector stores implement an `as_retriever` method that generates a Retriever, specifically a `VectorStoreRetriever`. These retrievers include specific `search_type` and `search_kwargs` attributes that identify which methods of the underlying vector store to call and how to parameterize them. For instance, we can build the same top-1 similarity retriever as above with the following:

In [36]:
# Retriever built by the Hugging Face-embedded store itself: similarity search, top 1.
retriever = chroma_vector_stores.as_retriever(search_type="similarity", search_kwargs=dict(k=1))

# None of the sample documents mention Nike, so each query just returns
# whichever document the similarity search ranks first.
nike_queries = [
    "How many distribution centers does Nike have in the US?",
    "When was Nike incorporated?",
]
retriever.batch(nike_queries)

[[Document(id='6', metadata={'source': 'website'}, page_content='Is the new iPhone worth the price? Read this review to find out.')],
 [Document(id='1', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')]]

In [37]:
# Retriever built by the OpenAI-embedded store itself: similarity search, top 1.
retriever = chroma_vector_stores_openai.as_retriever(search_type="similarity", search_kwargs=dict(k=1))

# None of the sample documents mention Nike, so each query just returns
# whichever document the similarity search ranks first.
nike_queries = [
    "How many distribution centers does Nike have in the US?",
    "When was Nike incorporated?",
]
retriever.batch(nike_queries)

[[Document(id='9', metadata={'source': 'news'}, page_content='The stock market is down 500 points today due to fears of a recession.')],
 [Document(id='6', metadata={'source': 'website'}, page_content='Is the new iPhone worth the price? Read this review to find out.')]]