In [1]:
import hashlib
import os

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# API credentials for the services used below; each is None when unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")


  from tqdm.autonotebook import tqdm


In [2]:
# Location of the PDF to index. Set the PDF_PATH environment variable (or add
# it to .env) to point at your own document; the original author's local path
# is kept as the fallback so existing setups keep working unchanged.
pdf_path = os.getenv(
    "PDF_PATH",
    "/Users/praddy/Documents/CODE/learning_langchain_folder/chatbot-docqa/data/attention.pdf",
)

# Read the PDF into a list of LangChain documents (the message below reports
# them as pages).
loader = PyPDFLoader(pdf_path)
documents = loader.load()
print(f"Loaded {len(documents)} pages")


Loaded 15 pages


In [3]:
# Break the loaded pages into overlapping chunks sized for embedding.
# chunk_size / chunk_overlap are the splitter's own settings; tune them here.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

docs = text_splitter.split_documents(documents)
chunk_count = len(docs)
print(f"Split into {chunk_count} chunks")


Split into 103 chunks


In [4]:
# Embedding model shared by ingestion (from_documents) and by the vector store
# used for retrieval, so documents and queries are embedded the same way.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [5]:
# Pinecone client and the name of the index that backs this chatbot.
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "doc-chatbot"

# Create the index only on first use so this cell can be re-run safely.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    # NOTE(review): dimension must equal the embedding model's vector size;
    # 384 is assumed to match all-MiniLM-L6-v2 - confirm if the model changes.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=serverless_spec,
    )


In [53]:
# Derive a stable ID for every chunk from its position and text. When no IDs
# are supplied the vector store generates new random ones on each call, so
# re-running this cell would upsert another copy of every chunk into the
# namespace and retrieval could return the same passage several times.
chunk_ids = [
    hashlib.sha256(f"{position}:{chunk.page_content}".encode("utf-8")).hexdigest()
    for position, chunk in enumerate(docs)
]

# Embed the chunks and upsert them into the "doc-bot" namespace of the index.
# With deterministic IDs a re-run overwrites the existing vectors in place.
vectorstore = PineconeVectorStore.from_documents(
    documents=docs,
    embedding=embedding_model,
    index_name=index_name,
    namespace="doc-bot",
    ids=chunk_ids,
)


In [9]:
# Attach to the existing index without ingesting anything: no documents are
# passed here, only the index name, embedding model and namespace.
# Uses the shared `index_name` variable (set where the index is created)
# instead of repeating the literal, so this cell can never point at a
# different index than the one that was created and populated.
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embedding_model,
    namespace="doc-bot",
)


In [6]:
# Fail fast with an actionable message: assigning None into os.environ raises
# an opaque "str expected, not NoneType" TypeError when the key is missing.
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your environment or .env file."
    )

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY  # Needed for langchain_google_genai

# Gemini model that generates the final answer from the retrieved context.
llm = GoogleGenerativeAI(model="gemini-2.0-flash")


In [7]:
# Prompt that restricts the model to the retrieved context and gives it a fixed
# fallback sentence for questions the context cannot answer.
# Placeholders: {context} (retrieved chunks) and {question} (the user's query).
qa_prompt_text = """You are a helpful assistant. Use only the information from the context below to answer the question.
If the question cannot be answered using the context, say:
"I’m sorry, I cannot answer that based on the provided document."

<context>
{context}
</context>

Question: {question}
Answer:"""

prompt_template = PromptTemplate.from_template(qa_prompt_text)


In [10]:
# Retriever over the vector store; k=3 is passed through as a search kwarg.
retriever_search_kwargs = {"k": 3}
retriever = vectorstore.as_retriever(search_kwargs=retriever_search_kwargs)

# "stuff" chain wired to the custom prompt. Source documents are returned
# alongside the answer so callers can inspect what was retrieved.
chain_kwargs = {"prompt": prompt_template}
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_kwargs,
)


In [16]:
# Ask a question through the retrieval chain and show the generated answer.
# NOTE(review): the saved output for this cell is the prompt's fallback
# sentence; a whole-paper summary may not be answerable from only the few
# chunks the retriever returns - consider a more specific question.
query = "Summarize this paper?"
result = qa_chain.invoke({"query": query})

answer_text = result["result"]
print("Answer:\n", answer_text)


Answer:
 I’m sorry, I cannot answer that based on the provided document.
