In [24]:
from langchain_core.documents import Document
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [25]:
dir_loader = DirectoryLoader("../data/pdf_files", glob="*.pdf", loader_cls=PyMuPDFLoader)
documents = dir_loader.load()
documents

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-12T09:48:22+00:00', 'source': '../data/pdf_files/ShrivatsaSaankhya_Env_Resume.pdf', 'file_path': '../data/pdf_files/ShrivatsaSaankhya_Env_Resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-12T09:48:22+00:00', 'trapped': '', 'modDate': 'D:20250812094822Z', 'creationDate': 'D:20250812094822Z', 'page': 0}, page_content='Saankhya Shrivatsa\n+1 (619) 415-3843 | shrivatsaankhya.bus@gmail.com | linkedin.com/in/saankhya-shrivatsa |\nEducation\nUniversity of California San Diego\nSan Diego, CA\nBachelor of Science in Global Health; Minor in Computational Social Sciences\nSept. 2022 – December 2025\nInventure Academy\nBangalore, India\nHigh School Diploma; 5 CAIE AS/A Levels including Physics, Chemistry, Math, and Biology\nMay 2020 – June 2022\nExperience\nResearch Assistant – Climate Epidemiology\nJune 2024 – 

In [26]:
# Text Splitting

def split_documents(documents, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]

    )
    return text_splitter.split_documents(documents)

split_docs = split_documents(documents)
split_docs

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-12T09:48:22+00:00', 'source': '../data/pdf_files/ShrivatsaSaankhya_Env_Resume.pdf', 'file_path': '../data/pdf_files/ShrivatsaSaankhya_Env_Resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-12T09:48:22+00:00', 'trapped': '', 'modDate': 'D:20250812094822Z', 'creationDate': 'D:20250812094822Z', 'page': 0}, page_content='Saankhya Shrivatsa\n+1 (619) 415-3843 | shrivatsaankhya.bus@gmail.com | linkedin.com/in/saankhya-shrivatsa |\nEducation\nUniversity of California San Diego\nSan Diego, CA\nBachelor of Science in Global Health; Minor in Computational Social Sciences\nSept. 2022 – December 2025\nInventure Academy\nBangalore, India\nHigh School Diploma; 5 CAIE AS/A Levels including Physics, Chemistry, Math, and Biology\nMay 2020 – June 2022\nExperience\nResearch Assistant – Climate Epidemiology\nJune 2024 – 

In [27]:
import numpy as np
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
class EmbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model() # calls the private method to load the model

    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model: {self.model_name}. Error: {e}")
            raise

    def generate_embeddings(self, texts : list[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not selected.")

        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"shape of embeddings {embeddings.shape}")
        return embeddings

In [29]:
embedding_manager = EmbeddingManager()
embedding_manager

Embedding dimension: 384


<__main__.EmbeddingManager at 0x1401ebcd0>

In [38]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

    def remove_all_documents(self):
        self.collection.delete(where={"doc_index": {"$gte": 0}})

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x13d8809d0>

In [39]:
chunks = split_docs

In [40]:
## Converting the text to embeddings

texts = [chunk.page_content for chunk in chunks]

embeddings = embedding_manager.generate_embeddings(texts)
vectorstore.add_documents(chunks, embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]

shape of embeddings (18, 384)
Adding 18 documents to vector store...
Successfully added 18 documents to vector store
Total documents in collection: 18





In [41]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)