### Data Ingestion



In [84]:
### =Document Structure

from langchain_core.documents import Document

In [85]:
doc = Document(
    page_content="This is the main content of the document. I am using it to create RAG",
    metadata={"source": "idontknow.txt",
               "author": "Sumit Padwal", 
               "pages" : 5,
               "date_created": "2026-02-02"
               }
)
doc

Document(metadata={'source': 'idontknow.txt', 'author': 'Sumit Padwal', 'pages': 5, 'date_created': '2026-02-02'}, page_content='This is the main content of the document. I am using it to create RAG')

In [86]:
## Create a simple txt file
import os
os.makedirs("../data/data", exist_ok=True)

In [87]:
sample_texts={
    "../data/data/python_intro.txt":"""Python Programming Language

Python is a high-level, interpreted programming language known for its simplicity and readability.
It was created by Guido van Rossum and first released in 1991.

Python is widely used in:
- Web development
- Data science and machine learning
- Automation and scripting
- Artificial intelligence
- Software development

One of Python’s biggest strengths is its easy-to-understand syntax, which allows developers to write
clear and concise code. This makes Python an excellent choice for beginners as well as professionals.

Python supports multiple programming paradigms, including procedural, object-oriented, and functional
programming. It also has a large standard library and a strong community, which helps developers build
applications faster and more efficiently.
"""
}

for filepath, content in sample_texts.items():
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
print("Sample text files created.")

Sample text files created.


In [88]:
###TextLoader
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/data/python_intro.txt", encoding="utf-8")
Documents = loader.load()
print(Documents)

[Document(metadata={'source': '../data/data/python_intro.txt'}, page_content='Python Programming Language\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nIt was created by Guido van Rossum and first released in 1991.\n\nPython is widely used in:\n- Web development\n- Data science and machine learning\n- Automation and scripting\n- Artificial intelligence\n- Software development\n\nOne of Python’s biggest strengths is its easy-to-understand syntax, which allows developers to write\nclear and concise code. This makes Python an excellent choice for beginners as well as professionals.\n\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional\nprogramming. It also has a large standard library and a strong community, which helps developers build\napplications faster and more efficiently.\n')]


In [89]:
###DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/data",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False
)

Documents = dir_loader.load()
Documents

[Document(metadata={'source': '..\\data\\data\\python_intro.txt'}, page_content='Python Programming Language\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nIt was created by Guido van Rossum and first released in 1991.\n\nPython is widely used in:\n- Web development\n- Data science and machine learning\n- Automation and scripting\n- Artificial intelligence\n- Software development\n\nOne of Python’s biggest strengths is its easy-to-understand syntax, which allows developers to write\nclear and concise code. This makes Python an excellent choice for beginners as well as professionals.\n\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional\nprogramming. It also has a large standard library and a strong community, which helps developers build\napplications faster and more efficiently.\n')]

In [90]:

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
## load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False
)
pdf_Documents = dir_loader.load()
pdf_Documents

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-02-01T14:23:52-06:00', 'source': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'file_path': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Sumit padwal', 'subject': '', 'keywords': '', 'moddate': '2026-02-01T14:23:52-06:00', 'trapped': '', 'modDate': "D:20260201142352-06'00'", 'creationDate': "D:20260201142352-06'00'", 'page': 0}, page_content='SUMIT SUNIL PADWAL \nChicago, IL | 312-358-0500 | sumitpadwal8@gmail.com| Linkedin | Github \nEDUCATION \n \nIllinois Institute of Technology, Chicago, IL                                                                                               Expected Graduation – May 2026 \nMaster of Science, Computer Science \nAjeenkya D Y Patil University, India                                                                                                                      

In [91]:
type(pdf_Documents[0])

langchain_core.documents.base.Document

In [92]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)

    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk
    if split_docs:
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [93]:
chunks = split_documents(pdf_Documents)

chunks


Split 1 documents into 6 chunks

Example chunk:
Content: SUMIT SUNIL PADWAL 
Chicago, IL | 312-358-0500 | sumitpadwal8@gmail.com| Linkedin | Github 
EDUCATION 
 
Illinois Institute of Technology, Chicago, IL                                                  ...
Metadata: {'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-02-01T14:23:52-06:00', 'source': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'file_path': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Sumit padwal', 'subject': '', 'keywords': '', 'moddate': '2026-02-01T14:23:52-06:00', 'trapped': '', 'modDate': "D:20260201142352-06'00'", 'creationDate': "D:20260201142352-06'00'", 'page': 0}


[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-02-01T14:23:52-06:00', 'source': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'file_path': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Sumit padwal', 'subject': '', 'keywords': '', 'moddate': '2026-02-01T14:23:52-06:00', 'trapped': '', 'modDate': "D:20260201142352-06'00'", 'creationDate': "D:20260201142352-06'00'", 'page': 0}, page_content='SUMIT SUNIL PADWAL \nChicago, IL | 312-358-0500 | sumitpadwal8@gmail.com| Linkedin | Github \nEDUCATION \n \nIllinois Institute of Technology, Chicago, IL                                                                                               Expected Graduation – May 2026 \nMaster of Science, Computer Science \nAjeenkya D Y Patil University, India                                                                                                                      

### Embedding and VectorStoreDB

In [94]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [95]:
class EmbeddingManager:
    """Handles document embedding generation using Sentence Transformers. """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for SentenceTransformer
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise 

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.
        
        Args:
            texts: List of strings to embed.
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """

        if not self.model:
            raise ValueError("Model not loaded.")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
## initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

loading embedding model: all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 868.78it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x127004537f0>

VectorStore

In [96]:
class VectorStore:
    """Manages document embedding in a ChromaDB vector store."""

    def __init__(self, collection_name: str = "pdf_Documents", persist_directory: str = "../data/vector_store/"):
        """
        Initialize the vector store.
        
        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Directory to persist the vector store.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store() 

    def _initialize_store(self):
        """Initialize ChromaDB clients= and collection"""
        try: 
            #Create persistnat ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            #Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Documents embedding for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Exisiting documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise    
    
    def add_documents(self, documents: List[Any], embedding_manager: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Args:
            documents: List of LangChain Documents.
            embeddings: Corresponding embeddings for the documents
        """

        if len(documents) != len(embedding_manager):
            raise ValueError("Number of documents must match number of embeddings.")
        
        print(f"Adding {len(documents)} documents to vector store...")

        #Prepare data for chromadb
        ids = []
        metadatas = []
        documents_texts = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_id_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_texts.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

            # Add to collection
            try:
                self.collection.add(
                    ids=ids,
                    metadatas=metadatas,
                    documents=documents_texts,
                    embeddings=embeddings_list
                )
                print(f"Successfully added {len(documents)} documents to vector store.")
                print(f"Total documents in collection: {self.collection.count()}")

            except Exception as e:
                print(f"Error adding documents to vector store: {e}")
                raise

vectorstore = VectorStore()
vectorstore

Vector store initialized. Collection: pdf_Documents
Exisiting documents in collection: 16


<__main__.VectorStore at 0x12702be2050>

In [97]:
chunks

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-02-01T14:23:52-06:00', 'source': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'file_path': '..\\data\\data\\pdf\\Resume Sumit Padwal.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Sumit padwal', 'subject': '', 'keywords': '', 'moddate': '2026-02-01T14:23:52-06:00', 'trapped': '', 'modDate': "D:20260201142352-06'00'", 'creationDate': "D:20260201142352-06'00'", 'page': 0}, page_content='SUMIT SUNIL PADWAL \nChicago, IL | 312-358-0500 | sumitpadwal8@gmail.com| Linkedin | Github \nEDUCATION \n \nIllinois Institute of Technology, Chicago, IL                                                                                               Expected Graduation – May 2026 \nMaster of Science, Computer Science \nAjeenkya D Y Patil University, India                                                                                                                      

In [98]:
### Convert the text to embeddings
texts = [doc.page_content for doc in chunks]

### Generate embeddings

embeddings = embedding_manager.generate_embeddings(texts)

## Store in vector database
vectorstore.add_documents(chunks, embeddings)


Generating embeddings for 6 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.38it/s]

Generated embeddings with shape: (6, 384)
Adding 6 documents to vector store...
Successfully added 6 documents to vector store.
Total documents in collection: 17
Successfully added 6 documents to vector store.
Total documents in collection: 18
Successfully added 6 documents to vector store.
Total documents in collection: 19
Successfully added 6 documents to vector store.
Total documents in collection: 20





Successfully added 6 documents to vector store.
Total documents in collection: 21
Successfully added 6 documents to vector store.
Total documents in collection: 22


### Retriever Pipeline

In [99]:
class RAGRetriever:
    """Retrieves relevant documents from vector store based on query embeddings."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the RAG retriever.
        
        Args:
            vector_store: Vector Store containing documents embeddings
            embedding_manager: Manager for generating query embeddings.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold:float =0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: User query string
            top_k: Number of top documents to return
            score_threshold: Minimum similarity score threshold

        Returns:
            List of dictionaries containing retrieveddocument and metadata    
        """

        print(f"Retrieving documents for query: {query}")
        print(f"Top k : {top_k}, Score threshold: {score_threshold}")

        #Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        #Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
            )

            # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                    Documents = results['documents'][0],
                    metadatas = results['metadatas'][0],
                    distances = results['distances'][0]
                    ids = results['ids'][0] 

                    for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, Documents, metadatas, distances)):
                        # Convert distance to similarity score
                        similarity_score = 1 - distance  

                        if similarity_score >= score_threshold:
                            retrieved_docs.append({
                                "id": doc_id,
                                "content": document,
                                "metadata": metadata,
                                "similarity_score": similarity_score,
                                "distance": distance,
                                "rank": i + 1
                                
                            })
                    print(f"Retrieved {len(retrieved_docs)} documents after filtering") 
            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
                print(f"Error retrieving documents: {e}")
                return []    
                
rag_retriever = RAGRetriever(vectorstore, embedding_manager)                    

In [100]:
rag_retriever

<__main__.RAGRetriever at 0x12701bbc910>

In [101]:
rag_retriever.retrieve("What is Sumit Padwal highest qualification?")

Retrieving documents for query: What is Sumit Padwal highest qualification?
Top k : 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 102.11it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents after filtering





[]