In [16]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [17]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 3 PDF files to process

Processing: 01Python LF.pdf
  ✓ Loaded 31 pages

Processing: AI_ML_DevOps-MLOps-Infrastructure_Architect4.pdf
  ✓ Loaded 4 pages

Processing: attentionIsAllYouNeed.pdf
  ✓ Loaded 15 pages

Total documents loaded: 50


In [18]:
all_pdf_documents

[Document(metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2018-02-13T21:21:23+05:30', 'author': 'RK', 'moddate': '2018-02-13T21:21:23+05:30', 'source': '..\\data\\pdf\\01Python LF.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': '01Python LF.pdf', 'file_type': 'pdf'}, page_content='1 DURGASOFT, # 202, 2nd Floor, HUDA Maitrivanam, Ameerpet, Hyderabad - 500038,  \n\uf028 040 – 64 51 27 86, 80 96 96 96 96, 92 46 21 21 43 | www.durgasoft.com \n \n \nLanguage Fundamentals \n \nIntroduction \n \n\uf0b7 Python is a general purpose high level programming language. \n \n\uf0b7 Python was developed by Guido Van Rossam in 1989 while working at National \nResearch Institute at Netherlands. \n \n\uf0b7 But officially Python was made available to public in 1991. The official Date of Birth for \nPython is : Feb 20th 1991. \n \n\uf0b7 Python is recommended as first programming language for beginners. \n \nEg1: To prin

In [19]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [20]:
chunks=split_documents(all_pdf_documents)
chunks

Split 50 documents into 122 chunks

Example chunk:
Content: 1 DURGASOFT, # 202, 2nd Floor, HUDA Maitrivanam, Ameerpet, Hyderabad - 500038,  
 040 – 64 51 27 86, 80 96 96 96 96, 92 46 21 21 43 | www.durgasoft.com 
 
 
Language Fundamentals 
 
Introduction 
 
...
Metadata: {'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2018-02-13T21:21:23+05:30', 'author': 'RK', 'moddate': '2018-02-13T21:21:23+05:30', 'source': '..\\data\\pdf\\01Python LF.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': '01Python LF.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2018-02-13T21:21:23+05:30', 'author': 'RK', 'moddate': '2018-02-13T21:21:23+05:30', 'source': '..\\data\\pdf\\01Python LF.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': '01Python LF.pdf', 'file_type': 'pdf'}, page_content='1 DURGASOFT, # 202, 2nd Floor, HUDA Maitrivanam, Ameerpet, Hyderabad - 500038,  \n\uf028 040 – 64 51 27 86, 80 96 96 96 96, 92 46 21 21 43 | www.durgasoft.com \n \n \nLanguage Fundamentals \n \nIntroduction \n \n\uf0b7 Python is a general purpose high level programming language. \n \n\uf0b7 Python was developed by Guido Van Rossam in 1989 while working at National \nResearch Institute at Netherlands. \n \n\uf0b7 But officially Python was made available to public in 1991. The official Date of Birth for \nPython is : Feb 20th 1991. \n \n\uf0b7 Python is recommended as first programming language for beginners. \n \nEg1: To prin

### embedding And vectorStoreDB

In [21]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1db8b54d6a0>

## VectorStore

In [23]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 70


<__main__.VectorStore at 0x1db8beada90>

In [24]:
chunks

[Document(metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2018-02-13T21:21:23+05:30', 'author': 'RK', 'moddate': '2018-02-13T21:21:23+05:30', 'source': '..\\data\\pdf\\01Python LF.pdf', 'total_pages': 31, 'page': 0, 'page_label': '1', 'source_file': '01Python LF.pdf', 'file_type': 'pdf'}, page_content='1 DURGASOFT, # 202, 2nd Floor, HUDA Maitrivanam, Ameerpet, Hyderabad - 500038,  \n\uf028 040 – 64 51 27 86, 80 96 96 96 96, 92 46 21 21 43 | www.durgasoft.com \n \n \nLanguage Fundamentals \n \nIntroduction \n \n\uf0b7 Python is a general purpose high level programming language. \n \n\uf0b7 Python was developed by Guido Van Rossam in 1989 while working at National \nResearch Institute at Netherlands. \n \n\uf0b7 But officially Python was made available to public in 1991. The official Date of Birth for \nPython is : Feb 20th 1991. \n \n\uf0b7 Python is recommended as first programming language for beginners. \n \nEg1: To prin

In [25]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 122 texts...


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.41it/s]

Generated embeddings with shape: (122, 384)
Adding 122 documents to vector store...
Successfully added 122 documents to vector store
Total documents in collection: 192





## Retriever Pipeline From VectorStore

In [40]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [37]:
rag_retriever

<__main__.RAGRetriever at 0x1db8bead7f0>

In [38]:
rag_retriever.retrieve("What is attention is all you need")

Retrieving documents for query: 'What is attention is all you need'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.45it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_a623ad81_82',
  'content': '3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3',
  'metadata': {'keywords': '',
   'page_label': '3',
   'moddate': '2024-04-10T21:11:43+00:00',
   'file_type': 'pdf',
   'source_file': 'attentionIsAllYouNeed.pdf',
   'doc_index': 82,
   'total_pages': 15,
   'source': '..\\data\\pdf\\attentionIsAllYouNeed.pdf',
   'content_length': 216,
   'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
   'creationdate': '2024-04-10T21:11:43+00:00',
   'subject': '',
   'page': 2,
   'producer': 'pdfTeX-1.40.25',
   'creator': 'LaTeX with hyperref',
   'author': '',
   'trapped': '/False',
   'title': ''},
  'similarity_score': 0.1399548053741455,
  'distance': 0.8600451946258545,
  'rank': 1}]

In [39]:
rag_retriever.retrieve("Position-wise Feed-Forward Networks")

Retrieving documents for query: 'Position-wise Feed-Forward Networks'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 51.78it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_03dbf6b2_88',
  'content': 'encoder.\n• Similarly, self-attention layers in the decoder allow each position in the decoder to attend to\nall positions in the decoder up to and including that position. We need to prevent leftward\ninformation flow in the decoder to preserve the auto-regressive property. We implement this\ninside of scaled dot-product attention by masking out (setting to −∞) all values in the input\nof the softmax which correspond to illegal connections. See Figure 2.\n3.3 Position-wise Feed-Forward Networks\nIn addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully\nconnected feed-forward network, which is applied to each position separately and identically. This\nconsists of two linear transformations with a ReLU activation in between.\nFFN(x) = max(0, xW1 + b1)W2 + b2 (2)\nWhile the linear transformations are the same across different positions, they use different parameters',
  'metadata': {'page_label': '5',
   

## RAG Pipeline- VectorDB To LLM Output Generation

## Integration Vectordb Context pipeline With LLM output

In [47]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")

llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-8b-instant",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [48]:
answer=rag_simple("What is attention mechanism?",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'What is attention mechanism?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.10it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





The attention mechanism is a function that maps a query and a set of key-value pairs to an output, where the output is a weighted sum of the values. It is used to focus on specific parts of the input when processing it, allowing models to attend to long-distance dependencies and relationships.


## Enhanced RAG Pipeline Features

In [52]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("WHow can i contribute to the Organization Goal?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'WHow can i contribute to the Organization Goal?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 25.09it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





Answer: As an AI/ML & MLOps Architect, you can contribute to the organization goal by:

- Developing deep learning solutions with full lifecycle integration.
- Designing and implementing AI and ML models that drive business growth and efficiency.
- Ensuring seamless integration of AI and ML solutions across the organization.
- Collaborating with cross-functional teams to identify and prioritize AI and ML projects.
- Providing expertise in AI and ML to drive innovation and stay ahead of the competition.
Sources: [{'source': 'AI_ML_DevOps-MLOps-Infrastructure_Architect4.pdf', 'page': 0, 'score': 0.2391219139099121, 'preview': 'algorithms. Develop deep learning solutions with full lifecycle integration. \n \n \n \nHow How can i contribute to the Organization Goal? \n How can Contribute to the Organization Goal? \n \nVinod Ulli          AI/ML & MLOps Architect   \nvinodullistudy@gmail.com     Phone: +91-9284038277...'}, {'source': 'AI_ML_DevOps-MLOps-Infrastructure_Architect4.pdf', 'page':