In [1]:
!pip install langchain langchain_community langchain-google-genai python-dotenv langchain_experimental langchain_chroma langchainhub pypdf rdflib rank_bm25

Collecting langchain_community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Step 1: Load the PDF document
loader = PyPDFLoader("/content/customer_product_support.pdf")
data = loader.load()  # entire PDF is loaded as a single Document

# Verify the data
print(data)

[Document(metadata={'source': '/content/customer_product_support.pdf', 'page': 0}, page_content='This is the product name and purchase data of customers \nCustomer name: this is the name of the customer \nContact: it will have the contact details of customer \nPurchase date: it will have the date of product purchase \nProduct name: it will have the name of product \nWarranty Details for Apple iPhone 16 Pro Max \nWarranty Details for Apple iPhone 16 Pro Max \n \nProduct: Apple iPhone 16 Pro Max \nWarranty Period: 1 year \nWarranty Start Date: Date of Purchase \nWarranty Coverage: Covers manufacturing defects and hardware malfunctions. Does not cover \nphysical damage or unauthorized repairs. \n \nFor more information, please refer to the user manual or contact our support team. \nWarranty Details for Samsung S24 Ultra \nWarranty Details for Samsung S24 Ultra \n \nProduct: Samsung S24 Ultra \nWarranty Period: 2year \nWarranty Start Date: Date of Purchase \nWarranty Coverage: Covers manuf

In [3]:
# Step 2: Split the document into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)

print("Total number of documents: ", len(docs))

Total number of documents:  11


In [22]:
# Step 3: Initialize the embeddings
import os
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("hello, world!")
print(vector[:5])

[0.05168594419956207, -0.030764883384108543, -0.03062233328819275, -0.02802734263241291, 0.01813093200325966]


In [5]:
# Step 4: Create a vector store and retriever
vectorstore = Chroma.from_documents(documents=docs, embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"))
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

In [6]:
# Step 5: Set up the LLM and RAG chain for simple RAG
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3, max_tokens=500)

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "You are an assistant for customer support"
    "Answer for customer question based on context of product name and purchase date"
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [7]:
# Step 6: Use the simple RAG pipeline
simple_response = rag_chain.invoke({"input": "Iam john doe what is the warranty of my device?"})
print("Simple RAG Output:", simple_response["answer"])



Simple RAG Output: This question cannot be answered without the product name and purchase date.  John Doe's information does not include these details needed to determine the device's warranty. Please provide the product name and purchase date.



In [8]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.document_loaders import CSVLoader
import os

# Step 1: Load the PDF document
pdf_loader = PyPDFLoader("/content/customer_product_support.pdf")
pdf_data = pdf_loader.load()

# Step 2: Split the PDF document into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
pdf_docs = text_splitter.split_documents(pdf_data)

In [9]:
# Step 3: Load and process CSV data
csv_loader = CSVLoader("/content/Emp_data_with_product.csv")
csv_documents = csv_loader.load()
csv_docs = text_splitter.split_documents(csv_documents)

In [16]:
# Step 4: Initialize the embeddings
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [17]:
# Step 5: Create separate vector stores
pdf_vectorstore = Chroma.from_documents(documents=pdf_docs, embedding=embeddings)
csv_vectorstore = Chroma.from_documents(documents=csv_docs, embedding=embeddings)

In [18]:
# Step 6: Create retrievers
pdf_retriever = pdf_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})
csv_retriever = csv_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

# Step 7: Set up the LLM and RAG chain
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3, max_tokens=500)


In [19]:
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "You are an assistant for customer support. "
    "Answer customer questions based on the context of product name and purchase date. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)

In [20]:
# Step 8: Combine results from both retrievers
def hybrid_retrieval(query):
    pdf_results = pdf_retriever.invoke(query)
    csv_results = csv_retriever.invoke(query)
    combined_results = pdf_results + csv_results
    return combined_results


In [21]:
# Step 9: Use the hybrid retrieval function
query = "Iam john doe what is the warranty of my device?"
context = hybrid_retrieval(query)
hybrid_response = question_answer_chain.invoke({"input": query, "context": context})
print("Hybrid RAG Output:", hybrid_response)

Hybrid RAG Output: Your Samsung S24 Ultra has a 2-year warranty from the date of purchase (2022-12-10).  This covers manufacturing defects and hardware malfunctions, but not physical damage or unauthorized repairs.  

