## RAG Pipeline - Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [5]:
## read all the pdf in the directory 
def process_all_pdf(pdf_path):
    """Process all pdf files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_path)

    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process!")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            document = loader.load()

            for doc in document:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = 'pdf'

            all_documents.extend(document)
            print(f"loaded {len(document)} pages")
        except Exception as e:
            print(f"Error: {e}")
    
    print(f"\n Total Documented loaded {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdf("../data")

Found 4 PDF files to process!

Processing: 1-s2.0-S1110016825004752-main.pdf
loaded 16 pages

Processing: 1-s2.0-S1877050923000236-main.pdf
loaded 12 pages

Processing: Crane-Droesch_2018_Environ._Res._Lett._13_114003.pdf
loaded 13 pages

Processing: sustainability-16-06976-v2.pdf
loaded 26 pages

 Total Documented loaded 67


In [7]:
all_pdf_documents[0]

Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Elsevier', 'creationdate': '2025-06-12T09:11:54+00:00', 'source': '..\\data\\pdf\\1-s2.0-S1110016825004752-main.pdf', 'file_path': '..\\data\\pdf\\1-s2.0-S1110016825004752-main.pdf', 'total_pages': 16, 'format': 'PDF 1.7', 'title': 'Harnessing deep learning to analyze climate change impacts on crop production', 'author': 'Amena Mahmoud', 'subject': 'Alexandria Engineering Journal, 125 (2025) 67-82. doi:10.1016/j.aej.2025.04.008', 'keywords': 'Climate Change,Agriculture,Artificial Intelligence (AI),Machine Learning (ML),Internet of Things (IoT),Geospatial Technologies,Crop Yield Prediction,Disease Detection', 'moddate': '2025-06-18T16:54:20+00:00', 'trapped': '', 'modDate': 'D:20250618165420Z', 'creationDate': 'D:20250612091154Z', 'page': 0, 'source_file': '1-s2.0-S1110016825004752-main.pdf', 'file_type': 'pdf'}, page_content='Original article\nHarnessing deep learning to analyze climate change impacts on \n

In [None]:
## text splitting into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """ Split documents into smalller parts 

    Args:
        documents (_type_): _description_
        chunk_size (int, optional): _description_. Defaults to 1000.
        chunk_overlap (int, optional): _description_. Defaults to 200.
    """

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n","\n"," ",""]
    )

    split_doc = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} document into {len(split_doc)} chunks ")

    if split_doc:
        print("\n Example chunk:")
        print(f"Content: {split_doc[0].page_content[:200]}...")
        print(f"Meta Data: {split_doc[0].metadata}")

    return split_doc



In [9]:
chunks = split_documents(all_pdf_documents)

Split 67 document into 319 chunks 

 Example chunk:
Content: Original article
Harnessing deep learning to analyze climate change impacts on 
crop production
Amena Mahmoud a,b,*, Khursheed Aurangzeb c
, Musaed Alhussein c,  
Manal Sobhy Ali Elbelkasy d
a Departm...
Meta Data: {'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Elsevier', 'creationdate': '2025-06-12T09:11:54+00:00', 'source': '..\\data\\pdf\\1-s2.0-S1110016825004752-main.pdf', 'file_path': '..\\data\\pdf\\1-s2.0-S1110016825004752-main.pdf', 'total_pages': 16, 'format': 'PDF 1.7', 'title': 'Harnessing deep learning to analyze climate change impacts on crop production', 'author': 'Amena Mahmoud', 'subject': 'Alexandria Engineering Journal, 125 (2025) 67-82. doi:10.1016/j.aej.2025.04.008', 'keywords': 'Climate Change,Agriculture,Artificial Intelligence (AI),Machine Learning (ML),Internet of Things (IoT),Geospatial Technologies,Crop Yield Prediction,Disease Detection', 'moddate': '2025-06-18T16:54:20+00:00', 'trapp

## Embedding and VectorStoreDB

In [10]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import Tuple, List, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
class EmbeddingManager:
    def __init__(self, model_name:str="all-MiniLM-L6-v2"):
        """Initialize the Embedding model manager

        Args:
            model_name (str, optional): _description_. Defaults to "all-MiniLM-L6-v2".
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        "Load the Sentence transformer"
        try:
            print(f"Loading Embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Sucesfully loaded. Embedding dimensions:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model}: {e}")
            raise

    def generate_embedding(self, text:List[str]) -> np.ndarray:
        """ generate embedding for a list of texts

        Args:
            text (List[str]): list of text strings 

        Returns:
            np.ndarray: numpy array of embedding 
        """

        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embedding for {len(text)} texts...")
        embedding = self.model.encode(text, show_progress_bar=True)
        print(f"Generated Embedding with shape: {embedding.shape}")
        return embedding

In [12]:
# initialize the Embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading Embedding model: all-MiniLM-L6-v2
Model Sucesfully loaded. Embedding dimensions:384


<__main__.EmbeddingManager at 0x2144bb48d70>

## VectorStore

In [19]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

In [20]:
vectorstore=VectorStore()
vectorstore
    

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x2144c6fde80>

In [22]:
chunks[:5]

[Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Elsevier', 'creationdate': '2025-06-12T09:11:54+00:00', 'source': '..\\data\\pdf\\1-s2.0-S1110016825004752-main.pdf', 'file_path': '..\\data\\pdf\\1-s2.0-S1110016825004752-main.pdf', 'total_pages': 16, 'format': 'PDF 1.7', 'title': 'Harnessing deep learning to analyze climate change impacts on crop production', 'author': 'Amena Mahmoud', 'subject': 'Alexandria Engineering Journal, 125 (2025) 67-82. doi:10.1016/j.aej.2025.04.008', 'keywords': 'Climate Change,Agriculture,Artificial Intelligence (AI),Machine Learning (ML),Internet of Things (IoT),Geospatial Technologies,Crop Yield Prediction,Disease Detection', 'moddate': '2025-06-18T16:54:20+00:00', 'trapped': '', 'modDate': 'D:20250618165420Z', 'creationDate': 'D:20250612091154Z', 'page': 0, 'source_file': '1-s2.0-S1110016825004752-main.pdf', 'file_type': 'pdf'}, page_content='Original article\nHarnessing deep learning to analyze climate change impacts on \

In [25]:
### convert the text to embedding 
texts=[doc.page_content for doc in chunks]

### generate the embedding
embeddings=embedding_manager.generate_embedding(texts)

### store in the vector 
vectorstore.add_documents(chunks, embeddings)

Generating embedding for 319 texts...


Batches: 100%|██████████| 10/10 [00:18<00:00,  1.84s/it]


Generated Embedding with shape: (319, 384)
Adding 319 documents to vector store...
Successfully added 319 documents to vector store
Total documents in collection: 319


## Retriever Pipeline from VectorStore

In [38]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager:EmbeddingManager):
        """_summary_

        Args:
            vector_store (VectorStore): _description_
            embedding_manager (EmbeddingManager): _description_
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embedding([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

In [39]:
rag_retriever = RAGRetriever(vector_store=vectorstore, embedding_manager=embedding_manager)
rag_retriever

<__main__.RAGRetriever at 0x2144c6fdd30>

In [44]:
rag_retriever.retrieve("crop yield")

Retrieving documents for query: 'crop yield'
Top K: 5, Score threshold: 0.0
Generating embedding for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 47.39it/s]

Generated Embedding with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_71c02d23_94',
  'content': '1. Introduction \nAgriculture is extremely important to the global economy. Understanding global crop yield is critical for resolving \nfood security issues and mitigating the effects of climate change as the human population continues to grow. Crop \nyield forecasting is a significant agricultural problem. Weather conditions (rain, temperature, etc.) and pesticides have \na great impact on agricultural yield. It is important to have accurate knowledge about crop yield history while making \ndecisions about agricultural risk management and yield forecasting [1]. Crop yield prediction is a challenge for \ndecision-makers at all levels, including global and local levels. Farmers may adopt a good crop yield prediction model \nto decide what to plant and when to plant it. Crop yield forecasting may be done in several ways [2] [3]. \n \n \n \n \n \n* Corresponding author. Tel.: +91 9680321224. \nE-mail address: Pratistha.mathur@jaipur.manipal.edu',
 

## Intgration VectorDB context pipeline with LLM output

In [48]:
### Simple RAG Pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

groq_api_key = os.getenv("groq_api_key")

llm=ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.1-8b-instant", temperature=0.1, max_tokens=1024)

## 2. simple RAG function : retrieve contect + generate response 

def simple_rag(query, retriever, llm, top_k=3):
    result= retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc["content"] for doc in result]) if result else ""
    if not context:
        return "No relevant context found to the answer the question"
    ## generate answer 
    prompt = f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content


In [49]:
answer = simple_rag("what effect crop yield", rag_retriever,llm)
print(answer)

Retrieving documents for query: 'what effect crop yield'
Top K: 3, Score threshold: 0.0
Generating embedding for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 31.89it/s]

Generated Embedding with shape: (1, 384)
Retrieved 3 documents (after filtering)





Crop yields are critically dependent on weather.


## Enhanced RAG Pipeline feature

In [52]:
    # --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

result = rag_advanced("what is the effect of climate on crops?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("\nAnswer:", result['answer'])
print("\nSources:", result['sources'])
print("\nConfidence:", result['confidence'])
print("\nContext Preview:", result['context'][:300])

Retrieving documents for query: 'what is the effect of climate on crops?'
Top K: 3, Score threshold: 0.1
Generating embedding for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.35it/s]

Generated Embedding with shape: (1, 384)
Retrieved 3 documents (after filtering)






Answer: Climate change can significantly impact crops, particularly wheat, by causing production losses due to extreme weather, changing rainfall trends, and rising temperatures. This can lead to reduced yields, making it harder to plan and handle crops effectively.

Sources: [{'source': 'sustainability-16-06976-v2.pdf', 'page': 1, 'score': 0.3429810404777527, 'preview': 'tools to investigate the complex interaction between climate conditions and wheat yield,\nproviding more precise and useful findings.\nExtreme weather, changing trends of rainfall, and rising temperatures are all things\nthat could hurt food security and agriculture production. Uncertainty about the we...'}, {'source': 'sustainability-16-06976-v2.pdf', 'page': 4, 'score': 0.30562764406204224, 'preview': 'at latitude 32.0740◦N and longitude 72.6861◦E, encompasses 3139 square kilometers.\nSargodha is a crucial area for wheat production in the region.\nFigure 2. Study area map.\nProjected climate changes, including temp