### RAG Pipeline 1 - Data Ingestion to Vector DB

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

##### Loading PDF documents

In [4]:
### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a flat list of documents.

    The directory is searched recursively. Each file is read with
    ``PyPDFLoader``; every document it returns gets two extra metadata keys:
    ``source_file`` (the PDF's file name) and ``file_type`` (``'pdf'``).

    Args:
        pdf_directory: Directory to scan (anything ``Path()`` accepts).

    Returns:
        A list holding the documents of all PDFs that loaded successfully,
        grouped by file in sorted path order. A file that fails to load is
        reported on stdout and skipped. A missing directory is reported on
        stdout and yields an empty list.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    if pdf_dir.is_dir():
        # Find all PDF files recursively. The suffix comparison is
        # case-insensitive so "report.PDF" is not skipped, and the result is
        # sorted because glob order depends on the filesystem, which would
        # make the document (and later chunk) order differ between machines.
        pdf_files = sorted(
            path for path in pdf_dir.rglob("*")
            if path.is_file() and path.suffix.lower() == ".pdf"
        )
    else:
        # Without this message a mistyped path looks exactly like an empty folder.
        print(f"Warning: '{pdf_dir}' is not an existing directory")
        pdf_files = []

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole ingestion run.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Load every PDF found under the data directory
all_pdf_documents = process_all_pdfs(pdf_directory="../data")

Found 5 PDF files to process

Processing: brain-tumours-primary-and-brain-metastases-in-over-16s-pdf-1837763558341.pdf
  ✓ Loaded 67 pages

Processing: malnutrition.pdf
  ✓ Loaded 11 pages

Processing: nice_hypertension_guideline_summary.pdf
  ✓ Loaded 3 pages

Processing: parkinsons-disease-in-adults-pdf-1837629189061.pdf
  ✓ Loaded 31 pages

Processing: motor-neurone-disease-assessment-and-management-pdf-1837449470149.pdf
  ✓ Loaded 45 pages

Total documents loaded: 157


#### Chunking document text

In [14]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Cut documents into smaller chunks and print a short summary.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Chunk size passed to the splitter; length is measured with ``len``.
        chunk_overlap: Overlap setting passed to the splitter.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked)} chunks")

    # Nothing to preview when the splitter produced no chunks
    if not chunked:
        return chunked

    # Preview the first chunk: first 200 characters of text plus its metadata
    first_chunk = chunked[0]
    print("\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")

    return chunked

In [15]:
chunks = split_documents(all_pdf_documents)
# Display only a short preview: echoing the whole list dumps every chunk into
# the cell output, which buries the narrative and bloats the notebook file.
chunks[:2]

Split 157 documents into 394 chunks

Example chunk:
Content: Brain tumours (primary) 
and brain metastases in 
over 16s 
NICE guideline 
Published: 11 July 2018 
Last updated: 29 January 2021 
www.nice.org.uk/guidance/ng99 
© NICE 2025. All rights reserved. Sub...
Metadata: {'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'NICE Publications', 'creationdate': '2021-01-29T00:00:00+00:00', 'keywords': 'NG99', 'subject': 'Brain tumours (primary) and brain metastases in over 16s (NG99)', 'author': 'National Institute for Health and Care Excellence (NICE)', 'title': 'Brain tumours (primary) and brain metastases in over 16s', 'source': '../data/pdf_files/brain-tumours-primary-and-brain-metastases-in-over-16s-pdf-1837763558341.pdf', 'total_pages': 67, 'page': 0, 'page_label': '1', 'source_file': 'brain-tumours-primary-and-brain-metastases-in-over-16s-pdf-1837763558341.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'NICE Publications', 'creationdate': '2021-01-29T00:00:00+00:00', 'keywords': 'NG99', 'subject': 'Brain tumours (primary) and brain metastases in over 16s (NG99)', 'author': 'National Institute for Health and Care Excellence (NICE)', 'title': 'Brain tumours (primary) and brain metastases in over 16s', 'source': '../data/pdf_files/brain-tumours-primary-and-brain-metastases-in-over-16s-pdf-1837763558341.pdf', 'total_pages': 67, 'page': 0, 'page_label': '1', 'source_file': 'brain-tumours-primary-and-brain-metastases-in-over-16s-pdf-1837763558341.pdf', 'file_type': 'pdf'}, page_content='Brain tumours (primary) \nand brain metastases in \nover 16s \nNICE guideline \nPublished: 11 July 2018 \nLast updated: 29 January 2021 \nwww.nice.org.uk/guidance/ng99 \n© NICE 2025. All rights reserved. Subject to Notice of rights (https://www.nice.org.uk/terms-and-\nconditions#notice-of-rights).'),
 Document(metadata={'producer'

### Embeddings and Vector Store DB

In [18]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import Any, List
from sklearn.metrics.pairwise import cosine_similarity


In [19]:
class EmbeddingManager:
    """Owns a SentenceTransformer model and turns lists of texts into embeddings.

    The model is loaded eagerly when the manager is constructed, so a bad
    model name fails at construction time rather than on first use.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the embedding manager

        Args:
            model_name: HuggingFace model for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

        Raises:
            Exception: whatever the model constructor raised, re-raised
                unchanged after the error has been printed.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)

        Raises:
            ValueError: if no model has been loaded.
        """
        # Identity check, not truthiness: a model object may define
        # __len__/__bool__, so a loaded model is not guaranteed to be truthy.
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        if len(texts) == 0:
            # Skip the model for an empty batch so the documented
            # (0, embedding_dim) shape holds however encode() treats empty input.
            embedding_dim = self.model.get_sentence_embedding_dimension() or 0
            embeddings = np.empty((0, embedding_dim), dtype=np.float32)
        else:
            embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
            
# Create the embedding manager with its default model, then display it

embedding_manager = EmbeddingManager()
embedding_manager


Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1218fee40>