### RAG Pipelines- Data Ingestion to vector DB pipeline

In [1]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_dir):
    """Process all PDF files in a directory and return a list of Documents."""
    # local import so the function works even if the import cell wasn't run
    from pathlib import Path
    all_documents = []
    pdf_dir = Path(pdf_dir)

    # find all files and filter for PDFs (case-insensitive)
    pdf_files = [p for p in pdf_dir.rglob('*') if p.suffix.lower() == '.pdf']

    print(f"Found {len(pdf_files)} PDF files.")

    for pdf_file in pdf_files:
        print(f"Processing: {pdf_file}")
        try:
            # Try PyPDFLoader first, fall back to PyMuPDFLoader on failure
            try:
                loader = PyPDFLoader(str(pdf_file))
                documents = loader.load()
            except Exception:
                loader = PyMuPDFLoader(str(pdf_file))
                documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata["source_file"] = str(pdf_file.name)
                doc.metadata["file_type"] = "pdf"

            all_documents.extend(documents)
            print(f" Loaded {len(documents)} pages from {pdf_file.name}")
        except Exception as e:
            print(f" Error processing {pdf_file.name}: {e}")
    
    print(f"Total documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("data/pdf")

Found 3 PDF files.
Processing: data\pdf\1758773206507.pdf
 Loaded 7 pages from 1758773206507.pdf
Processing: data\pdf\1759118729479.pdf
 Loaded 7 pages from 1758773206507.pdf
Processing: data\pdf\1759118729479.pdf
 Loaded 10 pages from 1759118729479.pdf
Processing: data\pdf\1760232290618.pdf
 Loaded 10 pages from 1759118729479.pdf
Processing: data\pdf\1760232290618.pdf
 Loaded 44 pages from 1760232290618.pdf
Total documents loaded: 61
 Loaded 44 pages from 1760232290618.pdf
Total documents loaded: 61


In [3]:
all_pdf_documents


[Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-25T04:06:03+00:00', 'source': 'data\\pdf\\1758773206507.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': '1758773206507.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-25T04:06:03+00:00', 'source': 'data\\pdf\\1758773206507.pdf', 'total_pages': 7, 'page': 1, 'page_label': '2', 'source_file': '1758773206507.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-25T04:06:03+00:00', 'source': 'data\\pdf\\1758773206507.pdf', 'total_pages': 7, 'page': 2, 'page_label': '3', 'source_file': '1758773206507.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-09-25T04:06:03+00:00', 'source':

In [4]:
def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """ Split documents into smaller chunks for better RAG performance. """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # separator=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(split_docs)} chunks.")

    if split_docs:
        print(f"\nExample chunks:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [5]:
chunks = split_documents(all_pdf_documents)
chunks

Split into 54 chunks.

Example chunks:
Content: PhonePe
SDE 2
Interview Experience
E x p e r i e n c e  -  2 0 0...
Metadata: {'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-09-29T04:03:43+00:00', 'title': 'Copy of Copy of Copy of Copy of Copy of Copy of Copy of Copy of Navneet - Microsoft', 'moddate': '2025-09-29T04:03:41+00:00', 'keywords': 'DAG0U6_7rHg,BACefIoeck4,0', 'author': 'Navneet Rabadiya', 'source': 'data\\pdf\\1759118729479.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'source_file': '1759118729479.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-09-29T04:03:43+00:00', 'title': 'Copy of Copy of Copy of Copy of Copy of Copy of Copy of Copy of Navneet - Microsoft', 'moddate': '2025-09-29T04:03:41+00:00', 'keywords': 'DAG0U6_7rHg,BACefIoeck4,0', 'author': 'Navneet Rabadiya', 'source': 'data\\pdf\\1759118729479.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'source_file': '1759118729479.pdf', 'file_type': 'pdf'}, page_content='PhonePe\nSDE 2\nInterview Experience\nE x p e r i e n c e  -  2 0 0'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-09-29T04:03:43+00:00', 'title': 'Copy of Copy of Copy of Copy of Copy of Copy of Copy of Copy of Navneet - Microsoft', 'moddate': '2025-09-29T04:03:41+00:00', 'keywords': 'DAG0U6_7rHg,BACefIoeck4,0', 'author': 'Navneet Rabadiya', 'source': 'data\\pdf\\1759118729479.pdf', 'total_pages': 10, 'page': 1, 'page_label': '2', 'source_file': '1759118729479.pdf', 'file_type': '

Embedding And VectorStore DB

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import os

# Initialize HuggingFace Embeddings (runs locally)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Create Vector Store using Chroma
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

print("Vector store created successfully with HuggingFace embeddings.")

Vector store created successfully with HuggingFace embeddings.


In [8]:
chunks

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-09-29T04:03:43+00:00', 'title': 'Copy of Copy of Copy of Copy of Copy of Copy of Copy of Copy of Navneet - Microsoft', 'moddate': '2025-09-29T04:03:41+00:00', 'keywords': 'DAG0U6_7rHg,BACefIoeck4,0', 'author': 'Navneet Rabadiya', 'source': 'data\\pdf\\1759118729479.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'source_file': '1759118729479.pdf', 'file_type': 'pdf'}, page_content='PhonePe\nSDE 2\nInterview Experience\nE x p e r i e n c e  -  2 0 0'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-09-29T04:03:43+00:00', 'title': 'Copy of Copy of Copy of Copy of Copy of Copy of Copy of Copy of Navneet - Microsoft', 'moddate': '2025-09-29T04:03:41+00:00', 'keywords': 'DAG0U6_7rHg,BACefIoeck4,0', 'author': 'Navneet Rabadiya', 'source': 'data\\pdf\\1759118729479.pdf', 'total_pages': 10, 'page': 1, 'page_label': '2', 'source_file': '1759118729479.pdf', 'file_type': '

In [9]:
texts = [doc.page_content for doc in chunks]
texts

['PhonePe\nSDE 2\nInterview Experience\nE x p e r i e n c e  -  2 0 0',
 'Summary\nüìå   Job Role: Software Development Engineer\nüî¢   Number of Rounds: 4\nüìú   Offer Status: Offer\nüìç   Location: Banglore\nüë§   Candidate Name: Not disclosing due to signed NDA',
 'Interview Process:\nThe interview was conducted virtually for the Bangalore\nlocation. It consisted of 4 rounds in total ‚Äì Machine Coding\nRound, Problem Solving/Data Structures Round, System\nDesign Round, and Hiring Manager Round.\nAfter completing all the rounds, I was informed within 1\nworking day that I had successfully cleared the interviews and\nthey would extend me an offer.',
 'Preparation Guide\nThe interviews at PhonePe were a mix of hands-on coding,\ndata structures and algorithms, system design (both HLD and\nLLD), and behavioral discussions.\nKey areas to prepare before such an interview include:\nObject-Oriented Programming and Machine Coding Practices\n‚Äì focus on code reusability, extensibility, 

Retriever Pipeline from vectorStore

In [12]:
db = Chroma.from_documents(chunks, embeddings, persist_directory="./chroma_db")

retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

relevant_docs = retriever.invoke("phonepe interview round?")

for i, doc in enumerate(relevant_docs):
    print(f"\nDocument {i+1}:\n")
    print(doc.page_content)
    if doc.metadata:
        print(f"\nMetadata: {doc.metadata}")


Document 1:

Round 4: Hiring
Manager
Duration: 60 minutes
Difficulty Level: Medium
Experience:
 This was more of a behavioral and managerial round. I was
asked about my past projects, challenges I faced, and how I
solved them. There were situational and behavioral questions
to assess my problem-solving style and communication.
I was done with formal questions in about 40 minutes, after
which the discussion shifted to PhonePe‚Äôs teams,
organizational structure, and work culture.
Key Learnings:
Prepare to talk in-depth about your projects‚Äîboth technical
and decision-making aspects.
Reflect on challenges you‚Äôve faced and how you handled
them.
Be curious and ask about the company‚Äôs culture, it shows
genuine interest.

Metadata: {'file_type': 'pdf', 'producer': 'Canva', 'title': 'Copy of Copy of Copy of Copy of Copy of Copy of Copy of Copy of Navneet - Microsoft', 'source': 'data\\pdf\\1759118729479.pdf', 'author': 'Navneet Rabadiya', 'moddate': '2025-09-29T04:03:41+00:00', 'keyword