In [None]:
!pip3 install -r requirements.txt

In [None]:
from pathlib import Path

from pinecone import Pinecone
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings


In [None]:
# Embedding model used both when storing chunks and when querying the vector store.
# NOTE(review): dimensions=512 presumably has to match the dimension the
# Pinecone index was created with — confirm.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=512,
)

# RAG Pipeline
Loads the PDF documents from the `data/pdf` folder, splits them into chunks, and stores the chunks in a Pinecone vector index.


In [None]:
import re

# Characters removed by clean_text():
#   U+0080-U+009F  C1 control characters
#   U+D800-U+DFFF  lone surrogates, which cannot be encoded as UTF-8
_UNWANTED_CHARS_RE = re.compile(r'[\x80-\x9f\ud800-\udfff]')


def clean_text(text):
    """Remove characters that cannot be UTF-8 encoded and trim whitespace.

    Lone surrogate code points (U+D800-U+DFFF) and C1 control characters
    (U+0080-U+009F) are deleted. Every other character is kept, including
    valid code points above U+FFFF such as mathematical symbols, which the
    previous implementation silently dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
        The result can always be encoded as UTF-8.
    """
    # Once the surrogates are gone the string is guaranteed to encode as
    # UTF-8, so no encode/decode round-trip is needed.
    return _UNWANTED_CHARS_RE.sub('', text).strip()

# Chunks of up to 1000 characters, with 200 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Load all PDFs found (recursively) under ./data/pdf
loader = DirectoryLoader("./data/pdf", glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Clean and add metadata
for doc in documents:
    # Path.stem drops the directory part (with either path separator on
    # Windows) and only the final extension, so a name such as
    # "notes.pdf.v2.pdf" keeps its inner ".pdf".
    doc.metadata['book_title'] = Path(doc.metadata['source']).stem
    # Clean the page content
    doc.page_content = clean_text(doc.page_content)

# Split documents into chunks
chunks = text_splitter.split_documents(documents)

# Additional cleaning for chunks
for chunk in chunks:
    chunk.page_content = clean_text(chunk.page_content)

# Filter out empty chunks
chunks = [chunk for chunk in chunks if chunk.page_content.strip()]

# Create vector store and add documents
# NOTE(review): `index` is not defined in any cell visible in this notebook, so
# on a fresh kernel the next line raises NameError. It needs a Pinecone index
# handle created in an earlier cell, e.g.
#     index = Pinecone().Index("<index-name>")
# NOTE(review): no explicit ids are passed to add_documents, so re-running this
# cell presumably uploads the same chunks again — confirm.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)
vector_store.add_documents(documents=chunks)

print(f"Successfully embedded {len(chunks)} chunks from {len(documents)} pages")

# Fetching the RAG response

Retrieves the most relevant chunks from the vector store and returns the model's response together with its source citations.


In [None]:
# RAG Query System
# ChatOpenAI is imported from langchain_openai, the package this notebook
# already uses for OpenAIEmbeddings; the langchain.chat_models import path
# is deprecated.
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Initialize the LLM
llm = ChatOpenAI(model="gpt-5", temperature=1)

# Create a retriever from the vector store
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},  # Retrieve top 3 most similar chunks
)

def fetch_rag_response(llm, retriever, query):
    """Answer a question with a retrieval-augmented "stuff" QA chain.

    Args:
        llm: Chat model that generates the answer.
        retriever: Retriever that supplies the context documents.
        query: The question to answer.

    Returns:
        The chain's output dict. The answer text is under 'result' and,
        because return_source_documents=True, the retrieved documents are
        under 'source_documents'.
    """
    # Create a custom prompt template
    prompt_template = """Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context:
    {context}

    Question: {question}
    Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create the RAG chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    # Calling the chain object directly is deprecated; invoke() is the
    # supported entry point and returns the same output dict.
    return qa_chain.invoke({"query": query})

# Example question (the duplicated "Designing a" in the original wording was a typo)
query = "How to design a URL Shortening service like TinyURL?"
result = fetch_rag_response(llm, retriever, query)

print(f"Question: {query}")
print(f"\nAnswer: {result['result']}")
print("\nSource documents:")
for i, doc in enumerate(result['source_documents'], start=1):
    # Read both metadata fields with .get() so a retrieved vector that lacks
    # 'book_title' prints "N/A" instead of raising KeyError.
    book_title = doc.metadata.get('book_title', 'N/A')
    page = doc.metadata.get('page', 'N/A')
    print(f"{i}. {book_title} - Page {page}")
    print(f"   Content: {doc.page_content[:150]}...")
    print()