In [60]:
#!pip install langchain_google-genai
#!pip install tiktoken
#!pip install pypdf
#!pip install langchain langchain-community
#!pip install openai
#!pip install langchain-openai
#!pip install python-dotenv
#!pip install langchain-pinecone

In [61]:
import os
from dotenv import load_dotenv

load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
DATA_DIR = os.getenv("DATA_DIR", "..\\RAG Project Dataset")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "rag-qna")
PINECONE_NAMESPACE = os.getenv("PINECONE_NAMESPACE", "default")
PINECONE_METRIC = os.getenv("PINECONE_METRIC", "cosine")

CHUNK_SIZE = 800
CHUNK_OVERLAP = 150
RETRIEVER_K = 4

EMBEDDING_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"
LLM_TEMPERATURE = 0

PROMPT_TEMPLATE = """
You are a research assistant.
Answer ONLY using the provided context.
If the answer is not present, say:
\"I could not find sufficient information in the documents.\"

Context:
{context}

Question:
{question}

Answer:
"""


In [62]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

def load_pdfs(data_dir: str):
    data_path = Path(data_dir)
    if not data_path.exists():
        raise FileNotFoundError(f"DATA_DIR not found: {data_path}")

    pdf_paths = sorted(p for p in data_path.rglob("*.pdf"))
    if not pdf_paths:
        raise FileNotFoundError(f"No PDFs found in: {data_path}")

    docs = []
    for pdf_path in pdf_paths:
        loader = PyPDFLoader(str(pdf_path))
        docs.extend(loader.load())
    return docs

docs = load_pdfs(DATA_DIR)

In [63]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_documents(docs):
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = splitter.split_documents(docs)
    counts = {}
    for chunk in chunks:
        src = chunk.metadata.get("source", "unknown")
        counts[src] = counts.get(src, 0) + 1
    for src, count in sorted(counts.items()):
        print(f"{src}: {count} chunks")
    return chunks
chunked_docs = chunk_documents(docs)

..\RAG Project Dataset\1706.03762v7.pdf: 22 chunks
..\RAG Project Dataset\2005.11401v4.pdf: 34 chunks
..\RAG Project Dataset\2005.14165v4.pdf: 120 chunks


In [64]:
from os import getenv

# Support multiple embedding providers via the EMBEDDING_PROVIDER env var
# Options: "google" (default), "openai", "hf" (Hugging Face)
env_provider = getenv("EMBEDDING_PROVIDER")
if env_provider:
    provider = env_provider.lower()
else:
    # prefer openai automatically if OPENAI_API_KEY exists
    provider = "openai" if getenv("OPENAI_API_KEY") else "google"

if provider == "google":
    from langchain_google_genai import GoogleGenerativeAIEmbeddings
    embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

elif provider == "openai":
    # OpenAI embeddings are in the langchain_openai package
    try:
        from langchain_openai import OpenAIEmbeddings
    except ImportError as e:
        raise ImportError(
            "OpenAIEmbeddings not found. Install langchain_openai with: pip install langchain-openai"
        ) from e
    # Use 1024 dimensions to match the Pinecone index
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=1024)
else:
    raise ValueError(f"Unsupported EMBEDDING_PROVIDER: {provider}")

print(f"Using embedding provider: {provider}")


Using embedding provider: openai


In [65]:
from langchain_pinecone import PineconeVectorStore

# Create Pinecone vector store with error handling for embedding/quota issues

vectorstore = PineconeVectorStore.from_documents(
        documents=chunked_docs,
        embedding=embeddings,
        index_name=PINECONE_INDEX_NAME,
        namespace=PINECONE_NAMESPACE
    )


In [66]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough 

In [67]:
# Build and run the QA chain using LCEL with OpenAI
prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=['context', 'question'],
)
llm = ChatOpenAI(model=LLM_MODEL, temperature=LLM_TEMPERATURE)
retriever = vectorstore.as_retriever(search_kwargs={'k': RETRIEVER_K})

# Format retrieved documents into context
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Build chain
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Run query
question = "Explain how positional encoding is implemented in Transformers and why it is necessary."
answer = chain.invoke(question)

# Get source documents
retrieved_docs = retriever.invoke(question)
sources = [
    {'document': doc.metadata.get('source'), 'page': doc.metadata.get('page')}
    for doc in retrieved_docs
]

print('Answer:', answer)
print('\nSources:')
for s in sources:
    name = os.path.basename(s['document'] or '')
    page = s['page']
    if page is not None:
        print(f'- {name} (Page {page})')
    else:
        print(f'- {name}')


Answer: I could not find sufficient information in the documents.

Sources:
- 1706.03762v7.pdf (Page 2.0)
- 1706.03762v7.pdf (Page 1.0)
- 1706.03762v7.pdf (Page 1.0)
- 1706.03762v7.pdf (Page 4.0)
