# RAG Demo (Gemini + Google Embeddings + Pinecone)

In [None]:
from src.loaders import load_pdfs
from src.chunking import chunk_documents
from src.vectorstore import build_pinecone_index
from src.config import DATA_DIR

# Ingestion via the project helpers imported from src/: the documents loaded
# from DATA_DIR are chunked, and the chunks are handed to build_pinecone_index.
# NOTE(review): the helpers are defined outside this notebook; presumably
# build_pinecone_index embeds and upserts the chunks (the message below
# suggests so) -- confirm against src/vectorstore.
# The names bound here (docs, chunks, vectorstore) are rebound by the inline
# pipeline in a later cell.
docs = load_pdfs(DATA_DIR)
chunks = chunk_documents(docs)
vectorstore = build_pinecone_index(chunks)
print("Pinecone index populated")

In [None]:
import os
from pathlib import Path
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Load settings from a local .env file (when present) into the process
# environment before they are read below.
load_dotenv()

# Folder searched for PDFs. The fallback is assembled with os.path.join so it
# uses the host's path separator: a literal '..\\' only resolves on Windows,
# while on POSIX the backslash is an ordinary filename character and the
# default would point at a non-existent entry.
DATA_DIR = os.getenv('DATA_DIR', os.path.join(os.pardir, 'RAG Project Dataset'))

# Pinecone credentials and target index. Both fall back to an empty string
# when unset, so they must be supplied through the environment or .env file.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY', '')
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME', '')

# Cloud provider and region used only when a new serverless index is created.
PINECONE_CLOUD = os.getenv('PINECONE_CLOUD', 'aws')
PINECONE_REGION = os.getenv('PINECONE_REGION', 'us-east-1')

# Prompt text for the question-answering chain. {context} and {question} are
# the two placeholders filled in when this string is wrapped in a
# PromptTemplate. The instructions restrict the model to the supplied context
# and give it a fixed fallback sentence for unanswerable questions.
PROMPT = """
You are a research assistant.
Answer ONLY using the provided context.
If the answer is not present, say:
"I could not find sufficient information in the documents."

Context:
{context}

Question:
{question}

Answer:
"""


In [None]:
def load_pdfs(data_dir):
    """Load every PDF found under ``data_dir`` (searched recursively).

    Files matching ``*.pdf`` are visited in sorted path order and each one is
    read with ``PyPDFLoader``; the documents returned by all loaders are
    concatenated into one flat list.

    Args:
        data_dir: Root folder to search, as a string or path-like object.

    Returns:
        A list of loaded documents; empty when no matching file is found.
    """
    root = Path(data_dir)
    return [
        page
        for pdf_file in sorted(root.rglob('*.pdf'))
        for page in PyPDFLoader(str(pdf_file)).load()
    ]

# Load the PDFs under DATA_DIR and split them into overlapping chunks.
# The splitter is built with from_tiktoken_encoder, so chunk_size and
# chunk_overlap are measured in tokens rather than characters.
docs = load_pdfs(DATA_DIR)
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=800,
    chunk_overlap=150,
)
chunks = splitter.split_documents(docs)

embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
pc = Pinecone(api_key=PINECONE_API_KEY)
# Embed a throwaway string once to discover the embedding vector length,
# which is needed as the index dimension if the index has to be created.
dimension = len(embeddings.embed_query('dimension_probe'))
# Create the serverless index only when no index with this name is listed.
if PINECONE_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=dimension,
        metric='cosine',
        spec=ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION),
    )
# Embed the chunks and store them in the named index; the resulting store is
# what the retriever in the question-answering cell is built from.
# NOTE(review): this runs on every execution of the cell; presumably
# re-running it upserts the same chunks again under new ids -- confirm whether
# duplicate vectors in the index are acceptable.
vectorstore = PineconeVectorStore.from_documents(
    chunks,
    embeddings,
    index_name=PINECONE_INDEX_NAME,
)


In [None]:
# Wrap the PROMPT text in a template whose two placeholders are filled by the
# chain: the retrieved passages ('context') and the user question ('question').
prompt = PromptTemplate(
    template=PROMPT,
    input_variables=['context', 'question'],
)
llm = ChatGoogleGenerativeAI(model='gemini-pro', temperature=0)
# Retrieve the 4 nearest chunks for each question.
retriever = vectorstore.as_retriever(search_kwargs={'k': 4})

# 'stuff' places all retrieved chunks into a single prompt; the source
# documents are returned alongside the answer so they can be cited below.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type='stuff',
    return_source_documents=True,
    chain_type_kwargs={'prompt': prompt},
)

# Calling the chain object directly (qa(...)) is deprecated in LangChain;
# invoke() is the supported entry point, and RetrievalQA reads the question
# from its 'query' input key.
result = qa.invoke(
    {'query': 'What are the two sub-layers in each Transformer encoder layer?'}
)
answer = result['result']
# Keep the raw metadata of each retrieved chunk (file path and page index).
sources = [
    {'document': doc.metadata.get('source'), 'page': doc.metadata.get('page')}
    for doc in result['source_documents']
]

print('Answer:', answer)
print('Sources:')
for s in sources:
    # A missing source path becomes an empty name instead of raising.
    name = os.path.basename(s['document'] or '')
    page = s['page']
    if isinstance(page, (int, float)):
        # PyPDFLoader stores a zero-based page index, and Pinecone returns
        # numeric metadata as floats; show a 1-based integer page number
        # instead of e.g. "Page 0.0" for the first page.
        print(f'- {name} (Page {int(page) + 1})')
    elif page is not None:
        # Non-numeric page labels are printed unchanged.
        print(f'- {name} (Page {page})')
    else:
        print(f'- {name}')
