In [1]:
### document strcture

from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Path to PDF (same folder as notebook)
pdf_path = "/home/sravanthi/Downloads/GUI_Ollama_RAG/data/text_files/ppe.pdf"

# Load PDF
loader = PyPDFLoader(pdf_path)
pdf_documents = loader.load()

# Add custom metadata
for doc in pdf_documents:
    doc.metadata.update({
        "source_file": "ppe.pdf",
        "doc_type": "pdf"
    })

# Debug / verification
print(f"Total pages loaded: {len(pdf_documents)}")
print("\nSample page content:\n")
print(pdf_documents[0].page_content[:300])
print("\nMetadata:\n")
print(pdf_documents[0].metadata)


Total pages loaded: 116

Sample page content:

GUIDELINES NO.  AERB/SG/IS-3
GOVERNMENT OF INDIA
PERSONAL PROTECTIVE EQUIPMENT
AERB SAFETY GUIDELINES
ATOMIC ENERGY    REGULATORY BOARD
GUIDELINES NO. AERB/SG/IS-3

Metadata:

{'producer': 'Acrobat Distiller 3.0 for Windows', 'creator': 'PageMaker 6.5', 'creationdate': 'D:191041009145244', 'author': 'VIJAY', 'title': 'IS-3.p65', 'source': '/home/sravanthi/Downloads/GUI_Ollama_RAG/data/text_files/ppe.pdf', 'total_pages': 116, 'page': 0, 'page_label': '1', 'source_file': 'ppe.pdf', 'doc_type': 'pdf'}


In [4]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=500,chunk_overlap=100):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [5]:
chunks=split_documents(pdf_documents)
chunks
# Add custom metadata to each chunk
for i, chunk in enumerate(chunks):
    chunk.metadata["chunk_id"] = i
    chunk.metadata["content_length"] = len(chunk.page_content)

# Check the first chunk to see the added metadata
print(chunks[0].metadata)

Split 116 documents into 433 chunks

Example chunk:
Content: GUIDELINES NO.  AERB/SG/IS-3
GOVERNMENT OF INDIA
PERSONAL PROTECTIVE EQUIPMENT
AERB SAFETY GUIDELINES
ATOMIC ENERGY    REGULATORY BOARD
GUIDELINES NO. AERB/SG/IS-3...
Metadata: {'producer': 'Acrobat Distiller 3.0 for Windows', 'creator': 'PageMaker 6.5', 'creationdate': 'D:191041009145244', 'author': 'VIJAY', 'title': 'IS-3.p65', 'source': '/home/sravanthi/Downloads/GUI_Ollama_RAG/data/text_files/ppe.pdf', 'total_pages': 116, 'page': 0, 'page_label': '1', 'source_file': 'ppe.pdf', 'doc_type': 'pdf'}
{'producer': 'Acrobat Distiller 3.0 for Windows', 'creator': 'PageMaker 6.5', 'creationdate': 'D:191041009145244', 'author': 'VIJAY', 'title': 'IS-3.p65', 'source': '/home/sravanthi/Downloads/GUI_Ollama_RAG/data/text_files/ppe.pdf', 'total_pages': 116, 'page': 0, 'page_label': '1', 'source_file': 'ppe.pdf', 'doc_type': 'pdf', 'chunk_id': 0, 'content_length': 163}


In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import os

In [7]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x778e4f8ad060>

In [8]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "/home/sravanthi/Downloads/GUI_Ollama_RAG/data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x778e4f8acdc0>

In [9]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 433 texts...


Batches: 100%|██████████| 14/14 [00:07<00:00,  1.95it/s]

Generated embeddings with shape: (433, 384)
Adding 433 documents to vector store...
Successfully added 433 documents to vector store
Total documents in collection: 433





In [10]:
from typing import List, Dict, Any, Optional
import numpy as np

class RAGRetriever:
    """Enhanced Retriever for query-based document retrieval from a vector store"""

    def __init__(self, vector_store, embedding_manager):
        """
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(
        self,
        query: str,
        top_k: int = 10,
        score_threshold: float = 0.0,
        metadata_filter: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            metadata_filter: Optional dict to filter results by metadata

        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding and normalize
        query_embedding = np.array(self.embedding_manager.generate_embeddings([query])[0])
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                where=metadata_filter  # if supported by vector store
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, doc, meta, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity (assuming cosine distance)
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': doc,
                            'metadata': meta,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [20]:
rag_retriever.retrieve("Specification of different types of protection equipment")

Retrieving documents for query: 'Specification of different types of protection equipment'
Top K: 10, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 15.49it/s]

Generated embeddings with shape: (1, 384)
Retrieved 10 documents (after filtering)





[{'id': 'doc_5bf84c0e_148',
  'content': 'protection equipment:\nl conformity to the relevant publications of BIS,\nl easy to don, use and take off,\nl adequate protection against hazards,\nl adequate protection factor,\nl durable,\nl comfortable and non-irritating,\nl good field of vision,\nl provision for communication and for use of spectacles,\nl easy to clean and maintain,\nl easy replacement of non-lasting parts,\nl availability of test certificates, and\nl availability of testing/servicing facility with the supplier.',
  'metadata': {'author': 'VIJAY',
   'source': 'ppe.pdf',
   'doc_type': 'pdf',
   'doc_index': 148,
   'creator': 'PageMaker 6.5',
   'creationdate': 'D:191041009145244',
   'total_pages': 116,
   'content_length': 475,
   'title': 'IS-3.p65',
   'producer': 'Acrobat Distiller 3.0 for Windows',
   'page': 67,
   'source_file': 'ppe.pdf',
   'chunk_id': 148,
   'page_label': '68'},
  'similarity_score': 0.35239553451538086,
  'distance': 0.6476044654846191,
  'ran