# Phase 5: Build RAG Store & Retrieval Service

This notebook implements a RAG (Retrieval-Augmented Generation) system for medical guidelines with:
1. PDF text extraction and chunking
2. Embedding computation
3. FAISS vector indexing
4. Search functionality testing

## 1. Setup Dependencies and Config

In [1]:
import os
import json
import fitz  # PyMuPDF
import numpy as np
from pathlib import Path
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import faiss
import re

# Configuration
# Settings that control how text is pulled out of each PDF page.
_TEXT_EXTRACTION = {
    'strategy': 'blocks',  # Options: 'text', 'blocks', 'dict'
    'combine_paragraphs': True,  # Combine text blocks into paragraphs
    'min_block_size': 20,  # Minimum characters in a text block
}

# Notebook-wide settings: input/output locations, chunking and embedding model.
CONFIG = {
    'docs_dir': '../data/docs',
    'index_dir': '../data/index',
    # Chunk sizes are passed to create_chunks, which counts
    # whitespace-separated words.
    'chunk_size': 300,  # target size of each chunk
    'chunk_overlap': 50,  # amount shared between consecutive chunks
    'embedding_model': 'all-MiniLM-L6-v2',  # sentence-transformers model
    'index_file': 'faiss_index.bin',
    'metadata_file': 'chunks_metadata.json',
    'text_extraction': _TEXT_EXTRACTION,
}

# Make sure the output directory exists before anything is written to it.
Path(CONFIG['index_dir']).mkdir(parents=True, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


## 2. Text Extraction and Chunking

In [2]:
# Typographic punctuation mapped to its plain ASCII equivalent. Written with
# escapes so the mapping survives editors/exports that rewrite curly quotes.
_PUNCTUATION_TABLE = str.maketrans({
    '\u201c': '"',  # left double quotation mark
    '\u201d': '"',  # right double quotation mark
    '\u2018': "'",  # left single quotation mark
    '\u2019': "'",  # right single quotation mark
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
})


def clean_text(text: str) -> str:
    """Clean extracted text by normalizing whitespace and removing artifacts.

    Steps, in order:
      1. Collapse every run of whitespace (including newlines) to one space.
      2. Drop characters for which ``str.isprintable()`` is false.
      3. Replace curly quotes and en/em dashes with ASCII ``"``, ``'``, ``-``.
      4. Strip leading and trailing whitespace.

    Args:
        text: Raw text as extracted from a document.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove any non-printable characters
    text = ''.join(char for char in text if char.isprintable())

    # Normalize quotes and dashes in a single pass
    text = text.translate(_PUNCTUATION_TABLE)

    return text.strip()

def extract_text_from_pdf(pdf_path: str) -> List[Dict[str, Any]]:
    """Read a PDF with PyMuPDF and return one record per non-empty page.

    The extraction mode comes from ``CONFIG['text_extraction']``:
    with ``strategy == 'blocks'`` each text block of the page is assembled
    from its spans, cleaned, and kept only if it has at least
    ``min_block_size`` characters; any other strategy takes the page's plain
    text as a single cleaned block. Blocks are then joined with a space when
    ``combine_paragraphs`` is true, otherwise with a newline.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of dicts with keys ``page_num`` (1-based), ``text`` and
        ``doc_id`` (the file name without extension). If an exception occurs
        while reading pages, it is printed and the pages collected so far
        are returned.
    """
    doc = fitz.open(pdf_path)
    pages = []

    try:
        for page_num, page in enumerate(doc, 1):
            settings = CONFIG['text_extraction']

            if settings['strategy'] == 'blocks':
                blocks = []
                for block in page.get_text("dict").get('blocks', []):
                    # Only type 0 (text) blocks are used
                    if block.get('type') != 0:
                        continue

                    # Concatenate every span of every line, space-terminated
                    raw_block = ''.join(
                        span.get('text', '') + ' '
                        for line in block.get('lines', [])
                        for span in line.get('spans', [])
                    )

                    if len(raw_block.strip()) >= settings['min_block_size']:
                        blocks.append(clean_text(raw_block))
            else:
                # Fallback to simple text extraction
                whole_page = page.get_text()
                blocks = [clean_text(whole_page)] if whole_page.strip() else []

            separator = ' ' if settings['combine_paragraphs'] else '\n'
            text = separator.join(blocks)

            # Pages that yield no text are left out of the result
            if not text.strip():
                continue

            pages.append({
                'page_num': page_num,
                'text': text,
                'doc_id': Path(pdf_path).stem
            })

    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {e}")

    finally:
        doc.close()

    return pages

def create_chunks(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words. The window advances by
    ``chunk_size - overlap`` words and stops as soon as a chunk reaches the
    end of the text, so no trailing chunk is emitted that is entirely
    contained in the previous one.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words repeated at the start of the next chunk.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order, each a single-space-joined string. Empty or
        whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
            non-positive step would otherwise never advance the window.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        # This chunk already covers the tail; a further window would only
        # repeat words that are part of it.
        if end >= len(words):
            break

    return chunks

 
# Process all PDFs in docs directory.
# chunks_with_metadata collects one dict per chunk (doc_id, page_num,
# chunk_idx, text); its order later defines the row order of the embeddings.
chunks_with_metadata = []
docs_dir = Path(CONFIG['docs_dir'])

# Sort the listing: directory iteration order depends on the file system, and
# an unstable order would make chunk positions differ from run to run.
for pdf_file in sorted(docs_dir.glob('*.pdf')):
    print(f"\nProcessing {pdf_file.name}...")
    pages = extract_text_from_pdf(str(pdf_file))

    total_chunks = 0
    for page in pages:
        chunks = create_chunks(
            page['text'],
            CONFIG['chunk_size'],
            CONFIG['chunk_overlap']
        )

        # chunk_idx restarts at 0 on every page
        for chunk_idx, chunk_text in enumerate(chunks):
            chunks_with_metadata.append({
                'doc_id': page['doc_id'],
                'page_num': page['page_num'],
                'chunk_idx': chunk_idx,
                'text': chunk_text
            })
            total_chunks += 1

    print(f"Extracted {len(pages)} pages and created {total_chunks} chunks")

print(f"\nTotal chunks created across all documents: {len(chunks_with_metadata)}")

# Display a sample chunk to verify content quality
if chunks_with_metadata:
    print("\nSample chunk:")
    sample = chunks_with_metadata[0]
    print(f"Document: {sample['doc_id']}, Page: {sample['page_num']}")
    print(f"Text: {sample['text'][:300]}...")


Processing IDF_Atlas.pdf...
Extracted 6 pages and created 6 chunks

Processing WHO_Fact_Sheet.pdf...
Extracted 8 pages and created 8 chunks

Total chunks created across all documents: 14

Sample chunk:
Document: IDF_Atlas, Page: 1
Text: FAQs Contact Data by location Data by indicators Resources 589 million adults (20-79 years) are living with diabetes worldwide Explore diabetes around the world Download 2025 Report Donate We value your privacy We use cookies to enhance your browsing experience, serve personalized ads or content, an...


## 3. Create Embedding Pipeline

In [3]:
# Load the sentence-transformers model named in the configuration
model = SentenceTransformer(CONFIG['embedding_model'])

# Embed the text of every chunk, in the same order as chunks_with_metadata
texts = [entry['text'] for entry in chunks_with_metadata]
embeddings = model.encode(texts, show_progress_bar=True)

# FAISS is fed a float32 numpy array
embeddings_np = np.array(embeddings, dtype='float32')

Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.89s/it]


## 4. Build Vector Index

In [4]:
# Build an L2 index whose dimensionality matches the embedding width,
# then load all chunk vectors into it
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Persist the index to disk
index_path = Path(CONFIG['index_dir']) / CONFIG['index_file']
faiss.write_index(index, str(index_path))

# Persist the chunk metadata next to it as pretty-printed JSON
metadata_path = Path(CONFIG['index_dir']) / CONFIG['metadata_file']
with open(metadata_path, 'w') as f:
    f.write(json.dumps(chunks_with_metadata, indent=2))

for label, saved_path in (("Index", index_path), ("Metadata", metadata_path)):
    print(f"{label} saved to {saved_path}")

Index saved to ..\data\index\faiss_index.bin
Metadata saved to ..\data\index\chunks_metadata.json


## 5. Implement Search Service

In [5]:
def search_chunks(query: str, k: int = 3) -> List[Dict[str, Any]]:
    """Search the FAISS index for the chunks closest to a query.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level ``index``; hits are resolved through
    ``chunks_with_metadata``.

    Args:
        query: Free-text search query.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` copies of chunk metadata dicts, in the order returned by
        the index, each with an added ``distance`` key (float). Fewer than
        ``k`` results are returned when the index holds fewer vectors.
    """
    # Generate query embedding
    query_embedding = model.encode([query])

    # Search in FAISS index
    distances, indices = index.search(query_embedding.astype('float32'), k)

    # Get results with metadata
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        # FAISS fills missing neighbours with label -1; used as a list index
        # that would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        chunk_data = chunks_with_metadata[int(idx)].copy()
        chunk_data['distance'] = float(distance)
        results.append(chunk_data)

    return results

# Example search function for the FastAPI endpoint
def create_search_response(query: str, k: int = 3) -> Dict[str, Any]:
    """Wrap the search hits for ``query`` in a response dictionary.

    Args:
        query: Free-text search query.
        k: Number of results requested from ``search_chunks``.

    Returns:
        A dict with the original ``query`` and the list of ``results``.
    """
    hits = search_chunks(query, k)
    return dict(query=query, results=hits)

## 6. Validation and Testing

In [7]:
# Test queries with expected content, written as (query, keywords) pairs.
# A query counts as answered when one retrieved chunk contains all keywords.
_QUERY_CASES = [
    ('global prevalence of diabetes',
     ['830 million', '2022', 'prevalence', 'low- and middle-income']),
    ('symptoms of diabetes',
     ['thirsty', 'urinate', 'blurred vision', 'tired', 'losing weight']),
    ('type 1 diabetes characteristics',
     ['insulin', 'deficient', 'daily', 'juvenile']),
    ('causes of type 2 diabetes',
     ['overweight', 'exercise', 'genetics', 'preventable']),
    ('gestational diabetes risks',
     ['pregnancy', 'delivery', 'complications', 'type 2']),
    ('prevention of type 2 diabetes',
     ['healthy diet', 'exercise', 'weight', 'tobacco']),
]

test_queries = [
    {'query': query_text, 'expected_keywords': keywords}
    for query_text, keywords in _QUERY_CASES
]

def evaluate_search_results(results: List[Dict[str, Any]], 
                          expected_keywords: List[str]) -> bool:
    """Report whether a single result holds every expected keyword.

    Matching is a case-insensitive substring test against each result's
    ``text`` field.

    Args:
        results: Search hits, each a dict with a ``text`` entry.
        expected_keywords: Keywords that must all occur in the same result.

    Returns:
        True if at least one result contains all keywords, otherwise False.
    """
    def contains_all(haystack: str) -> bool:
        return all(keyword.lower() in haystack for keyword in expected_keywords)

    return any(contains_all(hit['text'].lower()) for hit in results)

# Run every test query through the search and tally how many return a chunk
# containing all of its expected keywords
print("Running search quality tests...\n")
successes = 0

for case in test_queries:
    print(f"Query: {case['query']}")
    hits = search_chunks(case['query'])

    if evaluate_search_results(hits, case['expected_keywords']):
        successes += 1
        mark = '✓'
    else:
        mark = '❌'

    print(f"Found expected content: {mark}")
    print(f"Top result: {hits[0]['text'][:200]}...\n")

print(f"Test Results: {successes}/{len(test_queries)} queries successful")
print(f"Retrieval Quality: {(successes/len(test_queries))*100:.1f}%")

Running search quality tests...

Query: global prevalence of diabetes
Found expected content: ✓
Top result: The number of people living with diabetes rose from 200 million in 1990 to 830 million in 2022. Prevalence has been rising more rapidly in low- and middle-income countries than in high-income countrie...

Query: symptoms of diabetes
Found expected content: ✓
Top result: Symptoms of diabetes may occur suddenly. In type 2 diabetes, the symptoms can be mild and may take many years to be noticed. Symptoms of diabetes include: feeling very thirsty needing to urinate more ...

Query: type 1 diabetes characteristics
Found expected content: ✓
Top result: Type 1 diabetes (previously known as insulin-dependent, juvenile or childhood- onset) is characterized by deficient insulin production and requires daily administration of insulin. In 2017 there were ...

Query: causes of type 2 diabetes
Found expected content: ✓
Top result: Type 1 diabetes (previously known as insulin-dependent, juvenil