### RAG PIPELINE - Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_all_pdfs(pdf_directory):
    """Process PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    #Find all PD files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"Procesing file: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            #Add source data to metadata
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = 'pdf'

            all_documents.extend(documents)
            print(f" loaded {len(documents)} pages")

        except Exception as e:
            print(f"error {e}")

    print(f"\n Total documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data")



Found 2 PDF files to process
Procesing file: lionel_messi.pdf
 loaded 13 pages
Procesing file: cristiano_ronaldo.pdf
 loaded 13 pages

 Total documents loaded: 26


In [None]:
all_pdf_documents

### Text Splitter: getting into chunks

In [None]:
def split_documents(documents, chunk_size = 1000, chunk_overlap = 200):
    """split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators = ["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(split_docs)} chunks")

    #show example of a chunk
    if split_docs:
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")  #print first 200 characters of first chunk
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs


In [None]:
chunks = split_documents(all_pdf_documents)
chunks

### Embedding and VectorDB

In [12]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
class EmbeddingManager:
    """ Handles document embedding generation using Sentence Transformers """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the Embedding manager

        Args: 
            model_name: HuggingFace model name for sentence embdeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """ Load the sentence transformer model """
        try: 
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print("Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
            
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
            """
            Generate embeddings for a list of texts

            Args:
                texts: List of text strings to embed

            Returns:
                numpy array of embeddings with shape (len(texts), embedding_dimension)
            """
            if not self.model:
                raise ValueError("Embedding model is not loaded.")

            print(f"Generating embeddings for {len(texts)} texts")
            embeddings = self.model.encode(texts, show_progress_bar = True)
            print(f"Generated embeddings with shape: {embeddings.shape}")
            return embeddings
    
## Initialize Embedding Manager

embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}


<__main__.EmbeddingManager at 0x17739c6e0>

### Vector Store

In [18]:
class VectorStore:
    """ Manages document embeddings in a ChromaDB vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the Vector Store

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the ChromaDB data
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """ Initialize ChromaDB client and collection """
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            #Get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata = {"description": "PDF Document Embeddings for RAG"}
            )
            print(f"Vector store initalized. Collection: {self.collection_name}")
            print(f"Existing documents in colection: {self.collection.count()}")


        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Args:
            documents: List of document objects with page_content and metadata
            embeddings: numpy array of embeddings corresponding to the documents
        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        
        print(f"Adding {len(documents)} documents to vector store")

        #Prepare data for ChromaDB

        ids = []
        metadatas = []
        document_texts = []
        embeddings_list = []

        for i,(doc, embedding) in enumerate(zip(documents, embeddings)):
            #Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            #prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            #Document content
            document_texts.append(doc.page_content)

            #Embedding
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids = ids,
                metadatas = metadatas,
                documents = document_texts,
                embeddings = embeddings_list
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore = VectorStore()
vectorstore

Vector store initalized. Collection: pdf_documents
Existing documents in colection: 0


<__main__.VectorStore at 0x10b7ed2b0>

### Convert text to embeddings

In [19]:
text = [doc.page_content for doc in chunks]

## generate the embeddings
embeddings = embedding_manager.generate_embeddings(text)

##store in vector DB
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 48 texts


Batches: 100%|██████████| 2/2 [00:00<00:00,  2.98it/s]

Generated embeddings with shape: (48, 384)
Adding 48 documents to vector store
Successfully added 48 documents to vector store.
Total documents in collection: 48





### Retriever Pipeline from VectorStore

In [22]:
class RAGretriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the RAG Retriever

        Args:
            vector_store: Instance of VectorStore to retrieve documents from
            embedding_manager: Instance of EmbeddingManager to generate query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve top-k similar documents for a given query

        Args:
            query: The input query string
            top_k = number of top results to retrieve
            score_threshold: Minimum similarity score threshold for retrieved documents

        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: {query}")
        print(f"top-k: {top_k}, score threshold: {score_threshold}")

        #Generate embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        #Query the vector store
        
        try:
            results = self.vector_store.collection.query(
                query_embeddings = [query_embedding.tolist()],
                n_results = top_k
            )

            #process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

            for i, (doc_id, documents, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                similarity_score = 1 - distance  #Convert distance to similarity score

                if similarity_score >= score_threshold:
                    retrieved_docs.append({
                        'id': doc_id,
                        'content': documents,
                        'metadata': metadata,
                        'similarity_score': similarity_score,
                        'distance': distance,
                        'rank': i + 1
                    })
                else:
                    print("No document found")

            return retrieved_docs

        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []
        
rag_retriever = RAGretriever(vectorstore, embedding_manager)

In [23]:
rag_retriever

<__main__.RAGretriever at 0x10b7ee120>

In [24]:
rag_retriever.retrieve("Who is Lionel Messi")

Retrieving documents for query: Who is Lionel Messi
top-k: 5, score threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:03<00:00,  3.13s/it]

Generated embeddings with shape: (1, 384)





[{'id': 'doc_93a3fa29_8',
  'content': "LIONEL MESSI \n \n \nCopyright © www.FamousPeopleLessons.com \n5 \nCHOOSE THE CORRECT WORD: \nDelete the wrong word in each of the pairs in italics. \nLionel Andrés Messi was birth / born on June 24, 1987 in the city of \nRosario, Argentina. Everyone knows him now as the best / beast \nfootball player in the world. In fact, a lot of people say he might \nbecome the greatest player even / ever. We’ll see / watch about that \nat the World Cup. Messi is FIFA World Player of the Year. He also \nhelped his club team Barcelona win / wins the European Champions \nLeague and the World Club Championship. \nMessi grow / grew up playing football. He joined his first club when he \nwas five. At the age of eight, he signed in / up for the famous Newell's \nOld Boys' youth team. His career nearly ended when doctors found out \nhe suffered / suffering from a growth hormone deficiency. Luckily, \nagents from Barcelona saw his talented / talent and in 2000, he",


### Integration Vector DB context pipeline with LLM output

In [25]:
from dotenv import load_dotenv
import os

load_dotenv()  # Loads from /Users/ryan1022/Documents/RAG_tutorial/.env

groq_api_key = os.getenv("GROQ_API_KEY")

In [35]:
### simple RAG pipeline with groq LLM
from langchain_groq import ChatGroq
load_dotenv()

llm = ChatGroq(api_key=groq_api_key, model_name="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0.1, max_tokens = 1024)

## create a simple RAG function: retrieves context and generates answer
def rag_simple(query, retriever, llm, top_k = 3):
    ## retrieve the context
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([res['content'] for res in results]) if results else ""
    if not context:
        return "No relevant documents found."
    
    ## generate answeer using groq llm
    prompt =f""" 

            Use the following context to answer the question.
            If the context does not contain the answer, respond with "I don't know".
            If the question is not related to the context, respond with "I don't know".

            Context:
            {context}

            Question:
            {query}

            Answer:
            """
    response = llm.invoke(prompt.format(context=context, query=query))
    return response.content

In [36]:
answer = rag_simple("describe the early career of Cristiano Ronaldo", rag_retriever, llm)
print(answer)

Retrieving documents for query: describe the early career of Cristiano Ronaldo
top-k: 3, score threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]


Generated embeddings with shape: (1, 384)
Cristiano Ronaldo started kicking a ball around when he was three. His skill was obvious then, and by ten years old, two of Portugal's top clubs wanted to sign him. He joined Sporting Lisbon and became the only player in their history to play for the Under-16, Under-17, Under-18, B team, and first-team within one season. His skills soon attracted Europe's big teams, and Manchester United signed him in 2003.


In [37]:
## --- Enhanced RAG Pipeline features ---
def rag_advanced(query, retriever, llm, top_k = 5, min_score = 0.2, return_context = False):
    """
    RAG Pipeline with extra features:
    - returns answer, sources, confidence score, and optionally full context"""

    ## retrieve the context
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {
            "answer": "No relevant documents found.",
            "sources": [],
            "confidence_score": 0.0,
            "context": "" if return_context else None
        }
    
    #prepare context and sources
    context = "\n\n".join([res['content'] for res in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('file_type', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'oreview': doc['content'][:300] + "..."  #first 300 characters
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])

    
    ## generate answeer using groq llm
    prompt =f""" 

            Use the following context to answer the question.
            If the context does not contain the answer, respond with "I don't know".
            If the question is not related to the context, respond with "I don't know".

            Context:
            {context}

            Question:
            {query}

            Answer:
            """
    response = llm.invoke(prompt.format(context=context, query=query))
    output = {
        "answer": response.content,
        "sources": sources,
        "confidence_score": confidence
    }
    if return_context:
        output["context"] = context
    return output

# Example usage
result = rag_advanced("What are the main achievements of Lionel Messi?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence Score:", result['confidence_score'])
print("Context:", result['context'][:300])

Retrieving documents for query: What are the main achievements of Lionel Messi?
top-k: 3, score threshold: 0.1
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


Generated embeddings with shape: (1, 384)
Answer: Lionel Messi is known as the best football player, and his achievements include breaking records as soon as he played his first game for Barcelona in the 2004-05 season, becoming FIFA World Player of the Year, and having an impressive career ahead of him. He is also compared to legendary players like Pele and Maradona.
Sources: [{'source': 'lionel_messi.pdf', 'page': 2, 'score': 0.4458876848220825, 'oreview': '4. \nhe suffered from a growth  \nd. \nrecords \n5. \ncoached him until he made  \ne. \nhis first season \n6. \nLionel started breaking  \nf. \nPlayer of the Year \n7. \nwin the league in  \ng. \nto the legendary Pele \n8. \nMessi has an amazing career  \nh. \nup playing football \n9. \nexperts compare him \ni. \nhis Barc...'}, {'source': 'lionel_messi.pdf', 'page': 6, 'score': 0.3305927515029907, 'oreview': "LIONEL MESSI \n \n \nCopyright © www.FamousPeopleLessons.com \n7 \nPUT THE TEXT BACK TOGETHER \nNumber these lines in the c