# Lab 3.5.1 Solutions: Building Your First RAG Pipeline

Complete solutions to all exercises with explanations and alternative approaches.

## Setup

In [None]:
import sys
sys.path.insert(0, '..')

from pathlib import Path
from typing import List, Dict, Any
import numpy as np
import torch

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Report whether CUDA is usable and, if so, which device index 0 is.
cuda_ready = torch.cuda.is_available()
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

## Exercise 1 Solution: Load and Chunk Documents

**Task**: Load all sample documents and create optimized chunks with metadata.

In [None]:
def load_all_documents(data_dir: str = "../data/sample_documents") -> List[Document]:
    """
    Load every ``*.md`` file directly inside ``data_dir`` as a ``Document``.

    Key decisions:
    - Using pathlib for cross-platform compatibility
    - Files are processed in sorted path order so the returned list (and
      everything derived from it, such as chunk order) is reproducible;
      ``Path.glob`` on its own yields entries in arbitrary order
    - Extracting title from first ``# `` heading, falling back to the file stem
    - Adding comprehensive metadata for filtering

    Args:
        data_dir: Directory to scan (non-recursive) for markdown files.

    Returns:
        One ``Document`` per file, holding the full file text and metadata
        keys ``source``, ``filename``, ``title``, ``file_type``,
        ``char_count`` and ``word_count``. The list is empty when no
        markdown files are found.
    """
    documents = []

    for file_path in sorted(Path(data_dir).glob("*.md")):
        content = file_path.read_text(encoding='utf-8')

        # Title is the text of the first level-1 heading; if the file has
        # none, use the file name without its extension.
        title = next(
            (line[2:].strip() for line in content.split('\n') if line.startswith('# ')),
            file_path.stem,
        )

        # Create document with rich metadata
        documents.append(
            Document(
                page_content=content,
                metadata={
                    "source": str(file_path),
                    "filename": file_path.name,
                    "title": title,
                    "file_type": "markdown",
                    "char_count": len(content),
                    "word_count": len(content.split()),
                },
            )
        )

    return documents

# Read the sample corpus and list each document's title and size
documents = load_all_documents()
print(f"Loaded {len(documents)} documents:")
for doc in documents:
    meta = doc.metadata
    print(f"  - {meta['title']} ({meta['word_count']} words)")

In [None]:
def chunk_with_overlap(documents: List[Document], chunk_size: int = 500, overlap: int = 100) -> List[Document]:
    """
    Split each document into overlapping chunks and annotate every chunk.

    Key decisions:
    - 500 chars captures ~1-2 paragraphs (semantic units)
    - 100 char overlap preserves context at boundaries
    - Markdown separators respect document structure

    Each chunk's metadata gains ``chunk_index`` (position within its own
    document), ``total_chunks`` (chunks produced from that document) and
    ``chunk_size`` (length of the chunk text in characters).
    """
    # Tried in order: section headings first, then paragraphs, lines,
    # sentences and finally single spaces.
    markdown_separators = ["\n## ", "\n### ", "\n\n", "\n", ". ", " "]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=markdown_separators,
        length_function=len,
    )

    collected = []
    for source_doc in documents:
        # Split one document at a time so indices are per-document.
        pieces = splitter.split_documents([source_doc])
        piece_count = len(pieces)

        for position, piece in enumerate(pieces):
            piece.metadata.update(
                chunk_index=position,
                total_chunks=piece_count,
                chunk_size=len(piece.page_content),
            )

        collected += pieces

    return collected

# Chunk the loaded documents and summarise the result
chunks = chunk_with_overlap(documents)
print(f"Created {len(chunks)} chunks")
chunk_sizes = [c.metadata['chunk_size'] for c in chunks]
print(f"Average chunk size: {np.mean(chunk_sizes):.0f} chars")

### Alternative Approach: Semantic Chunking

For more advanced use cases, you can chunk based on semantic similarity.

In [None]:
def semantic_chunk(text: str, embedding_model, threshold: float = 0.7) -> List[str]:
    """
    Alternative: Semantic-aware chunking based on embedding similarity.

    The text is split into sentences; consecutive sentences are grouped
    into one chunk for as long as each new sentence's cosine similarity to
    the centroid of the current chunk exceeds ``threshold``.

    Args:
        text: Text to chunk.
        embedding_model: Any object exposing
            ``embed_documents(list_of_str)`` that returns one vector per
            input string.
        threshold: A sentence joins the current chunk only when its
            similarity to the chunk centroid is strictly greater than this.

    Returns:
        Chunks in original order, each the space-joined sentences of one
        group. When fewer than two sentences are found, ``[text]``.
    """
    import re

    # Split after sentence-ending punctuation; drop empty fragments (e.g.
    # from trailing whitespace) so they are never embedded or chunked.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s]
    if len(sentences) < 2:
        return [text]

    # Get embeddings for all sentences
    embeddings = np.asarray(embedding_model.embed_documents(sentences), dtype=float)

    chunks = []
    current_chunk = [sentences[0]]
    # Running sum of the embeddings of exactly the sentences in
    # current_chunk; the centroid is this sum divided by the chunk length.
    running_sum = embeddings[0].copy()

    for sentence, vector in zip(sentences[1:], embeddings[1:]):
        centroid = running_sum / len(current_chunk)
        denom = np.linalg.norm(centroid) * np.linalg.norm(vector)
        # A zero-length vector has no direction: treat it as dissimilar
        # instead of dividing by zero.
        if denom > 0:
            similarity = float(np.dot(centroid, vector) / denom)
        else:
            similarity = float("-inf")

        if similarity > threshold:
            current_chunk.append(sentence)
            running_sum += vector
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            running_sum = vector.copy()

    # current_chunk always holds at least one sentence here.
    chunks.append(' '.join(current_chunk))

    return chunks

print("Semantic chunking function defined (use with embedding model)")

## Exercise 2 Solution: Create Vector Store with GPU Acceleration

**Task**: Create an optimized vector store using DGX Spark's GPU.

In [None]:
def create_optimized_vectorstore(
    chunks: List[Document],
    model_name: str = "BAAI/bge-large-en-v1.5",
    persist_dir: str = "./chroma_db"
) -> "tuple[Chroma, HuggingFaceEmbeddings]":
    """
    Create GPU-accelerated vector store.

    Key optimizations:
    - Uses CUDA for embedding computation when available, otherwise CPU
    - L2 normalization for cosine similarity
    - Batch processing for efficiency

    WARNING: any existing directory at ``persist_dir`` is deleted before
    the new store is built.

    Args:
        chunks: Documents to embed and index.
        model_name: Embedding model name passed to ``HuggingFaceEmbeddings``.
        persist_dir: Directory passed to Chroma as ``persist_directory``.

    Returns:
        A ``(vectorstore, embedding_model)`` pair.
    """
    import shutil
    import time
    from pathlib import Path

    # Clean previous database so the collection is rebuilt from scratch
    if Path(persist_dir).exists():
        shutil.rmtree(persist_dir)

    # Load embedding model with GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    embedding_model = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        encode_kwargs={
            "normalize_embeddings": True,  # L2 normalize for cosine sim
            "batch_size": 64  # Larger batches for GPU
        }
    )

    # Create vector store; perf_counter is monotonic and high resolution,
    # which makes it the right clock for measuring elapsed time.
    start = time.perf_counter()

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=persist_dir,
        collection_name="rag_documents"
    )

    elapsed = time.perf_counter() - start
    # Guard the division in case the timer reports no elapsed time.
    throughput = len(chunks) / elapsed if elapsed > 0 else float("inf")
    print(f"Created vector store with {len(chunks)} chunks in {elapsed:.2f}s")
    print(f"Throughput: {throughput:.1f} chunks/second")

    return vectorstore, embedding_model

# Build the vector store from the chunks; the function returns both the
# store and the embedding model it used.
vectorstore, embedding_model = create_optimized_vectorstore(chunks)

## Exercise 3 Solution: Implement Complete RAG Pipeline

**Task**: Build an end-to-end RAG system with proper generation.

In [None]:
class CompleteRAGPipeline:
    """
    Retrieval-augmented generation pipeline: retrieve, build context, generate.

    Features:
    - Configurable retrieval (k, threshold)
    - Source tracking for citations
    - Fallback handling
    - Context window management (configurable token budget)
    """

    # Maximum number of characters of a chunk shown in a source preview.
    PREVIEW_CHARS = 200
    # Rough characters-per-token ratio used to convert the token budget
    # into a character budget.
    CHARS_PER_TOKEN = 4

    def __init__(
        self,
        vectorstore: "Chroma",
        llm_model: str = "llama3.1:8b",
        k: int = 5,
        score_threshold: float = 0.3,
        context_window: int = 4096,
    ):
        """
        Args:
            vectorstore: Store queried via ``similarity_search_with_score``.
            llm_model: Model name passed to ``ollama.chat``.
            k: Number of candidates requested per query.
            score_threshold: Minimum converted similarity a candidate needs
                in order to be kept.
            context_window: Tokens available for context.
        """
        self.vectorstore = vectorstore
        self.llm_model = llm_model
        self.k = k
        self.score_threshold = score_threshold
        self.context_window = context_window

    def retrieve(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents with scores.

        Returns a list of dicts with keys ``content``, ``metadata`` and
        ``score``, in the order the vector store returned them, keeping
        only entries whose converted similarity is at least
        ``score_threshold``.
        """
        results = self.vectorstore.similarity_search_with_score(query, k=self.k)

        retrieved = []
        for doc, score in results:
            # Convert distance to similarity (Chroma uses L2 distance):
            # distance 0 maps to 1.0 and larger distances approach 0.
            similarity = 1 / (1 + score)

            if similarity >= self.score_threshold:
                retrieved.append({
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "score": similarity
                })

        return retrieved

    def build_context(self, retrieved: List[Dict]) -> str:
        """
        Build context string with source tracking.

        Each entry is rendered as ``[Source N: <title>]`` followed by its
        content. Entries are added in order until the next one would exceed
        the character budget; that entry and all later ones are dropped.
        """
        context_parts = []
        total_chars = 0
        max_chars = self.context_window * self.CHARS_PER_TOKEN  # Rough char estimate

        for i, doc in enumerate(retrieved):
            source = doc['metadata'].get('title', doc['metadata'].get('filename', 'Unknown'))
            content = f"[Source {i+1}: {source}]\n{doc['content']}"

            if total_chars + len(content) > max_chars:
                break

            context_parts.append(content)
            total_chars += len(content)

        return "\n\n".join(context_parts)

    def generate(self, query: str, context: str) -> str:
        """
        Generate answer using LLM.

        Never raises: if ``ollama`` cannot be imported or the chat call
        fails, the error is returned as the answer text so the notebook
        keeps running.
        """
        prompt = f"""You are a helpful AI assistant. Answer the question based ONLY on the provided context.
If the context doesn't contain enough information, say "I don't have enough information to answer this."
Cite sources using [Source N] format.

CONTEXT:
{context}

QUESTION: {query}

ANSWER:"""

        try:
            # Imported lazily so the rest of the pipeline works without ollama.
            import ollama
            response = ollama.chat(
                model=self.llm_model,
                messages=[{"role": "user", "content": prompt}]
            )
            return response["message"]["content"]
        except Exception as e:
            # Deliberate best-effort fallback: surface the failure as text.
            return f"Error generating response: {e}"

    @classmethod
    def _preview(cls, content: str) -> str:
        """Return ``content`` shortened to PREVIEW_CHARS, with "..." only if cut."""
        if len(content) <= cls.PREVIEW_CHARS:
            return content
        return content[:cls.PREVIEW_CHARS] + "..."

    def query(self, question: str) -> Dict[str, Any]:
        """
        Complete RAG query with full results.

        Returns a dict with keys ``question``, ``answer``, ``sources`` (one
        entry per retrieved chunk with ``title``, ``score`` and ``preview``)
        and ``num_sources``. When nothing passes the score threshold, a
        fixed fallback answer is returned and the LLM is not called.
        """
        # Retrieve
        retrieved = self.retrieve(question)

        if not retrieved:
            return {
                "question": question,
                "answer": "I couldn't find any relevant information in the knowledge base.",
                "sources": [],
                "num_sources": 0
            }

        # Build context
        context = self.build_context(retrieved)

        # Generate
        answer = self.generate(question, context)

        return {
            "question": question,
            "answer": answer,
            "sources": [{
                "title": r['metadata'].get('title', 'Unknown'),
                "score": r['score'],
                "preview": self._preview(r['content'])
            } for r in retrieved],
            "num_sources": len(retrieved)
        }

# Instantiate the pipeline on the vector store using the constructor's
# default settings (llama3.1:8b, k=5, score_threshold=0.3).
rag = CompleteRAGPipeline(vectorstore)
print("RAG pipeline created successfully!")

In [None]:
# Smoke-test the pipeline end to end with three sample questions
test_questions = [
    "What is the memory capacity of DGX Spark?",
    "How does LoRA reduce the number of trainable parameters?",
    "What are the key components of a transformer?"
]

divider = "=" * 60

for question in test_questions:
    print(f"\n{divider}")
    print(f"Q: {question}")
    print(divider)

    result = rag.query(question)
    answer_head = result['answer'][:500]
    print(f"\nAnswer: {answer_head}...")
    print(f"\nSources used: {result['num_sources']}")

    # Show at most the first three sources, numbered from 1
    top_sources = result['sources'][:3]
    for rank, src in enumerate(top_sources, start=1):
        print(f"  [{rank}] {src['title']} (score: {src['score']:.3f})")

## Exercise 4 Solution: Evaluate Retrieval Quality

**Task**: Implement metrics to evaluate RAG performance.

In [None]:
def evaluate_retrieval(
    vectorstore: "Chroma",
    test_queries: List[Dict[str, Any]]
) -> Dict[str, float]:
    """
    Evaluate retrieval quality with standard metrics.

    Relevance is judged at file level: a retrieved chunk counts as relevant
    when its ``filename`` metadata is one of the query's expected sources.

    Metrics (each averaged over the queries that list expected sources):
    - Recall@k: Fraction of expected source files that appear in the results
    - MRR: Mean Reciprocal Rank (position of first relevant result)
    - Precision@k: Fraction of retrieved chunks that are relevant

    Args:
        vectorstore: Store queried via ``similarity_search_with_score``.
        test_queries: Dicts with a required ``query`` key and optional
            ``expected_sources`` (list of filenames) and ``k`` (default 5).
            Queries without expected sources are skipped.

    Returns:
        Dict with keys ``recall@k``, ``precision@k`` and ``mrr``; every
        value is ``0.0`` when no query could be scored.
    """
    recalls = []
    mrrs = []
    precisions = []

    for query_data in test_queries:
        query = query_data["query"]
        expected_sources = set(query_data.get("expected_sources", []))
        k = query_data.get("k", 5)

        # Without ground truth nothing can be scored, so skip the search.
        if not expected_sources:
            continue

        # Retrieve
        results = vectorstore.similarity_search_with_score(query, k=k)
        retrieved_sources = [doc.metadata.get("filename", "") for doc, _ in results]

        # One relevance flag per retrieved chunk, in rank order.
        hits = [source in expected_sources for source in retrieved_sources]

        # Recall@k over distinct expected files
        found = expected_sources.intersection(retrieved_sources)
        recalls.append(len(found) / len(expected_sources))

        # Precision@k counts every relevant chunk, so several chunks from
        # the same expected file are each counted as relevant.
        precisions.append(sum(hits) / len(hits) if hits else 0.0)

        # MRR: reciprocal of the 1-based rank of the first relevant chunk
        first_hit = next((rank for rank, hit in enumerate(hits, start=1) if hit), None)
        mrrs.append(1.0 / first_hit if first_hit else 0.0)

    return {
        "recall@k": float(np.mean(recalls)) if recalls else 0.0,
        "precision@k": float(np.mean(precisions)) if precisions else 0.0,
        "mrr": float(np.mean(mrrs)) if mrrs else 0.0
    }

# Evaluation set: each query is expected to surface chunks from one file
test_queries = [
    {"query": query_text, "expected_sources": [expected_file], "k": 5}
    for query_text, expected_file in (
        ("DGX Spark memory specifications", "dgx_spark_technical_guide.md"),
        ("LoRA fine-tuning technique", "lora_finetuning_guide.md"),
        ("transformer attention mechanism", "transformer_architecture_explained.md"),
    )
]

metrics = evaluate_retrieval(vectorstore, test_queries)
print("Retrieval Evaluation Results:")
for metric_name in metrics:
    print(f"  {metric_name}: {metrics[metric_name]:.3f}")

## Performance Comparison: CPU vs GPU

Benchmark showing the benefit of GPU acceleration on DGX Spark.

In [None]:
import time

def benchmark_embedding(texts: List[str], device: str) -> Dict[str, Any]:
    """
    Benchmark embedding performance on specified device.

    Loads ``BAAI/bge-large-en-v1.5`` on ``device``, embeds the first ten
    texts once as a warmup, then times a single pass over all ``texts``.

    Args:
        texts: Strings to embed.
        device: Device string passed to the embedding model (this notebook
            uses "cuda" and "cpu").

    Returns:
        Dict with ``device`` (str), ``num_texts`` (int), ``total_time``
        (seconds) and ``throughput`` (texts per second).
    """
    model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True, "batch_size": 32}
    )

    # Warmup so one-time start-up cost is not included in the measurement
    model.embed_documents(texts[:10])

    # Benchmark with a monotonic, high-resolution clock; the embeddings
    # themselves are not needed, only the elapsed time.
    start = time.perf_counter()
    model.embed_documents(texts)
    elapsed = time.perf_counter() - start

    return {
        "device": device,
        "num_texts": len(texts),
        "total_time": elapsed,
        # Guard the division in case the timer reports no elapsed time.
        "throughput": len(texts) / elapsed if elapsed > 0 else float("inf")
    }

# Benchmark input: the raw text of every chunk
benchmark_texts = [chunk.page_content for chunk in chunks]

print(f"Benchmarking with {len(benchmark_texts)} text chunks...")

# GPU benchmark (skipped when CUDA is unavailable)
if torch.cuda.is_available():
    gpu_results = benchmark_embedding(benchmark_texts, "cuda")
    gpu_time = gpu_results['total_time']
    gpu_rate = gpu_results['throughput']
    print(f"\nGPU Results:")
    print(f"  Time: {gpu_time:.2f}s")
    print(f"  Throughput: {gpu_rate:.1f} texts/sec")

# CPU benchmark (on subset for speed)
cpu_texts = benchmark_texts[:50]
cpu_results = benchmark_embedding(cpu_texts, "cpu")
cpu_time = cpu_results['total_time']
cpu_rate = cpu_results['throughput']
print(f"\nCPU Results (subset of {len(cpu_texts)} texts):")
print(f"  Time: {cpu_time:.2f}s")
print(f"  Throughput: {cpu_rate:.1f} texts/sec")

# Speedup is the ratio of throughputs, so the differing input sizes are
# normalised out.
if torch.cuda.is_available():
    speedup = gpu_results['throughput'] / cpu_results['throughput']
    print(f"\nGPU Speedup: {speedup:.1f}x faster")

## Key Takeaways

1. **Document Loading**: Always add comprehensive metadata for filtering and debugging

2. **Chunking**: 500-character chunks with a 100-character overlap work well for general documents. Consider semantic chunking when you need more coherent chunks.

3. **GPU Acceleration**: DGX Spark's GPU typically provides a 5-10x speedup for embedding computation; run the CPU vs GPU benchmark above to measure the actual figure on your system

4. **Evaluation**: Always measure Recall@k and MRR to track retrieval quality

5. **Production**: Include source tracking, score thresholds, and proper error handling