# Lab 3.5.4 Solutions: Hybrid Search Implementation

Complete solutions for hybrid retrieval that combines dense (embedding) and sparse (BM25) search using Reciprocal Rank Fusion (RRF).

## Setup

In [None]:
import sys
sys.path.insert(0, '..')

import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import torch

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

print(f"GPU available: {torch.cuda.is_available()}")

In [None]:
# Load and prepare data
def load_and_chunk(
    data_dir: str = "../data/sample_documents",
    pattern: str = "*.md",
    chunk_size: int = 500,
    chunk_overlap: int = 100,
) -> List[Document]:
    """
    Read every matching file under ``data_dir`` and split it into chunks.

    Args:
        data_dir: Directory that holds the source documents.
        pattern: Glob pattern selecting the files to load.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The chunked documents. Each source document carries its file name
        under ``"source"`` and its stem under ``"title"`` in the metadata.
    """
    documents = [
        Document(
            page_content=file_path.read_text(encoding='utf-8'),
            metadata={"source": file_path.name, "title": file_path.stem},
        )
        # Path.glob yields entries in arbitrary order; sorting keeps chunk
        # indices (and therefore tie-breaking in the retrievers) reproducible.
        for file_path in sorted(Path(data_dir).glob(pattern))
    ]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Chunk the sample corpus once; the retrievers below all index this list
chunks = load_and_chunk()
print(f"Loaded {len(chunks)} chunks")

# Load embedding model on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(normalize_embeddings=True),
)

## Exercise 1 Solution: Implement BM25 Retriever

**Task**: Build a complete BM25 sparse retriever with proper tokenization.

In [None]:
class BM25Retriever:
    """
    BM25 sparse retrieval with configurable tokenization.

    BM25 scores documents based on term frequency (TF) and
    inverse document frequency (IDF), with saturation for TF.
    Scoring is delegated to ``rank_bm25.BM25Okapi``; tokenization uses NLTK.
    """

    def __init__(
        self,
        documents: List[Document],
        k1: float = 1.5,  # TF saturation
        b: float = 0.75,  # Length normalization
        remove_stopwords: bool = True
    ):
        """
        Initialize BM25 with documents.

        Args:
            documents: Documents to index; ``page_content`` is tokenized.
            k1: Controls term frequency saturation (1.2-2.0 typical)
            b: Controls length normalization (0.75 typical)
            remove_stopwords: Drop English stopwords from documents and queries.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        # An empty corpus otherwise fails inside rank_bm25 with an opaque
        # ZeroDivisionError while it computes the average document length.
        if not documents:
            raise ValueError("BM25Retriever requires at least one document")

        self.documents = documents
        self.k1 = k1
        self.b = b
        self.remove_stopwords = remove_stopwords

        # Setup tokenization
        self._setup_tokenizer()

        # Index documents
        self._build_index()

    def _setup_tokenizer(self):
        """Setup NLTK tokenizer and stopwords, downloading missing data once."""
        import nltk
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize

        def load_resources():
            # Importing word_tokenize never fails; the tokenizer data is only
            # looked up on the first call, so probe it here instead of letting
            # the LookupError surface later inside _preprocess.
            word_tokenize("probe")
            return set(stopwords.words('english'))

        try:
            self.stopwords = load_resources()
        except LookupError:
            # Newer NLTK releases read the tokenizer from 'punkt_tab',
            # older ones from 'punkt'; fetch both so either version works.
            for resource in ('punkt', 'punkt_tab', 'stopwords'):
                nltk.download(resource, quiet=True)
            self.stopwords = load_resources()

        self.tokenize = word_tokenize

    def _preprocess(self, text: str) -> List[str]:
        """
        Tokenize and optionally remove stopwords.

        The text is lower-cased and only purely alphanumeric tokens are kept,
        so punctuation and tokens containing characters such as ``-`` or ``.``
        are dropped.
        """
        tokens = self.tokenize(text.lower())
        tokens = [t for t in tokens if t.isalnum()]
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self.stopwords]
        return tokens

    def _build_index(self):
        """Build BM25 index."""
        from rank_bm25 import BM25Okapi

        # Tokenize all documents
        self.tokenized_docs = [
            self._preprocess(doc.page_content)
            for doc in self.documents
        ]

        # Create BM25 index
        self.bm25 = BM25Okapi(
            self.tokenized_docs,
            k1=self.k1,
            b=self.b
        )

        # Compute average document length
        self.avg_doc_len = np.mean([len(d) for d in self.tokenized_docs])
        print(f"BM25 index built: {len(self.documents)} docs, avg length: {self.avg_doc_len:.0f} tokens")

    def search(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
        """
        Search for documents matching the query.

        Args:
            query: Free-text query; preprocessed like the documents.
            k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            Up to ``k`` ``(document, score)`` pairs, best first. Documents
            with a score of 0 or less are omitted, so fewer than ``k`` pairs
            may be returned.
        """
        # Guard explicitly: with k == 0 the slice [-0:] would select *every*
        # document instead of none.
        if k <= 0:
            return []

        scores = self.get_all_scores(query)

        # Get top-k indices, highest score first
        top_indices = np.argsort(scores)[::-1][:k]

        return [
            (self.documents[idx], float(scores[idx]))
            for idx in top_indices
            if scores[idx] > 0  # Only include if there's a match
        ]

    def get_all_scores(self, query: str) -> np.ndarray:
        """Get BM25 scores for all documents."""
        tokenized_query = self._preprocess(query)
        return self.bm25.get_scores(tokenized_query)

# Index the chunks with BM25
bm25_retriever = BM25Retriever(chunks)

# Smoke test: show the three best keyword matches
test_results = bm25_retriever.search("memory capacity specifications", k=3)
print("\nTest search results:")
for doc, score in test_results:
    preview = doc.page_content[:80]
    print(f"  [{score:.3f}] {preview}...")

## Exercise 2 Solution: Implement Dense Retriever

**Task**: Build a GPU-accelerated dense retriever with embedding caching.

In [None]:
class DenseRetriever:
    """
    Dense retrieval using pre-computed embeddings.

    Document embeddings are computed once at construction time through
    ``embedding_model.embed_documents``; whether that runs on a GPU depends on
    how the embedding model was configured. Queries are scored with a dot
    product, which equals cosine similarity when the model returns
    normalized embeddings.

    When ``cache_embeddings`` is true, query embeddings are memoized so that
    repeating a query (for example while sweeping fusion parameters) does not
    re-run the embedding model.
    """

    # Upper bound on memoized query embeddings; the oldest entry is evicted first.
    _MAX_CACHED_QUERIES = 1024

    def __init__(
        self,
        documents: List[Document],
        embedding_model: HuggingFaceEmbeddings,
        cache_embeddings: bool = True
    ):
        """
        Embed all documents up front.

        Args:
            documents: Documents to index; ``page_content`` is embedded.
            embedding_model: Object providing ``embed_documents`` and ``embed_query``.
            cache_embeddings: Memoize query embeddings between calls.
        """
        self.documents = documents
        self.embedding_model = embedding_model
        self.cache_embeddings = cache_embeddings
        self._query_cache: Dict[str, np.ndarray] = {}

        # Pre-compute embeddings
        print("Computing document embeddings...")
        start = time.time()

        texts = [doc.page_content for doc in documents]
        self.embeddings = np.array(embedding_model.embed_documents(texts))

        elapsed = time.time() - start
        print(f"Computed {len(documents)} embeddings in {elapsed:.2f}s")
        print(f"Embedding shape: {self.embeddings.shape}")

    def _embed_query(self, query: str) -> np.ndarray:
        """Return the query embedding, reusing a memoized value when enabled."""
        if self.cache_embeddings:
            cached = self._query_cache.get(query)
            if cached is not None:
                return cached

        query_emb = np.array(self.embedding_model.embed_query(query))

        if self.cache_embeddings:
            # dicts keep insertion order, so the first key is the oldest entry
            if len(self._query_cache) >= self._MAX_CACHED_QUERIES:
                self._query_cache.pop(next(iter(self._query_cache)))
            self._query_cache[query] = query_emb
        return query_emb

    def search(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
        """
        Search for semantically similar documents.

        Args:
            query: Free-text query.
            k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            Up to ``k`` ``(document, similarity)`` pairs, most similar first.
        """
        # Guard explicitly: with k == 0 the slice [-0:] would select *every*
        # document instead of none.
        if k <= 0:
            return []

        similarities = self.get_all_scores(query)

        # Get top-k, highest similarity first
        top_indices = np.argsort(similarities)[::-1][:k]

        return [(self.documents[i], float(similarities[i])) for i in top_indices]

    def get_all_scores(self, query: str) -> np.ndarray:
        """Get similarity scores (dot products) for all documents."""
        return np.dot(self.embeddings, self._embed_query(query))

# Embed the chunks and build the dense index
dense_retriever = DenseRetriever(chunks, embedding_model=embedding_model)

# Smoke test: show the three most similar chunks
test_results = dense_retriever.search("memory capacity specifications", k=3)
print("\nTest search results:")
for doc, score in test_results:
    preview = doc.page_content[:80]
    print(f"  [{score:.3f}] {preview}...")

## Exercise 3 Solution: Implement Hybrid Retriever with RRF

**Task**: Combine dense and sparse retrieval with Reciprocal Rank Fusion (RRF).

In [None]:
class HybridRetriever:
    """
    Hybrid retrieval combining dense and sparse methods.

    Fusion methods:
    - RRF (weighted Reciprocal Rank Fusion): rank-based, so raw scores need
      no normalization; tuned through ``alpha`` and ``rrf_k``
    - Linear: Weighted combination of min-max normalized scores

    Results of the two retrievers are matched by object identity, so both
    retrievers are expected to have been built from the same ``Document``
    objects. A document returned by only one retriever still takes part in
    the fusion with that single contribution.
    """

    def __init__(
        self,
        dense_retriever: DenseRetriever,
        sparse_retriever: BM25Retriever,
        alpha: float = 0.5,
        fusion_method: str = "rrf",
        rrf_k: int = 60
    ):
        """
        Initialize hybrid retriever.

        Args:
            dense_retriever: Retriever providing ``search``, ``get_all_scores``
                and ``documents``.
            sparse_retriever: Retriever providing ``search`` and ``get_all_scores``.
            alpha: Weight for dense retriever (1-alpha for sparse)
            fusion_method: "rrf" selects RRF; any other value selects linear fusion
            rrf_k: RRF constant (higher = more equal weighting)
        """
        self.dense = dense_retriever
        self.sparse = sparse_retriever
        self.alpha = alpha
        self.fusion_method = fusion_method
        self.rrf_k = rrf_k
        self.documents = dense_retriever.documents
        # Identity -> position lookup, built once instead of on every query.
        self._doc_to_idx = {id(doc): i for i, doc in enumerate(self.documents)}

    def search(
        self,
        query: str,
        k: int = 5,
        first_stage_k: int = 50
    ) -> List[Tuple[Document, float, Dict[str, Any]]]:
        """
        Hybrid search with fusion.

        Args:
            query: Free-text query.
            k: Maximum number of results. Values <= 0 yield an empty list.
            first_stage_k: Candidates requested from each retriever before
                RRF fusion (ignored by linear fusion, which scores all
                documents). Raised to ``k`` when smaller.

        Returns:
            List of (Document, score, metadata) tuples
            Metadata includes source scores and ranks
        """
        # Guard explicitly: slicing with k == 0 in the linear path would
        # otherwise select every document.
        if k <= 0:
            return []
        if self.fusion_method == "rrf":
            return self._rrf_search(query, k, first_stage_k)
        return self._linear_search(query, k)

    def _add_ranking(
        self,
        fused: Dict[int, Dict[str, Any]],
        ranked: List[Tuple[Document, float]],
        weight: float,
        source: str
    ) -> None:
        """Fold one retriever's ranked list into ``fused`` (source is "dense" or "sparse")."""
        for rank, (doc, score) in enumerate(ranked, start=1):
            # Fall back to the object id for documents unknown to the dense index
            key = self._doc_to_idx.get(id(doc), id(doc))
            entry = fused.setdefault(key, {
                "doc": doc,
                "rrf_score": 0,
                "dense_rank": None,
                "dense_score": None,
                "sparse_rank": None,
                "sparse_score": None
            })
            entry["rrf_score"] += weight / (self.rrf_k + rank)
            entry[f"{source}_rank"] = rank
            entry[f"{source}_score"] = score

    def _rrf_search(
        self,
        query: str,
        k: int,
        first_stage_k: int
    ) -> List[Tuple[Document, float, Dict]]:
        """
        Weighted Reciprocal Rank Fusion.

        RRF Score = alpha / (rrf_k + dense_rank) + (1 - alpha) / (rrf_k + sparse_rank)

        with 1-based ranks; a retriever that did not return the document
        contributes nothing.

        Benefits:
        - No need to normalize scores
        - Robust to outliers
        - Works well with different scoring scales
        """
        # A candidate pool smaller than k would silently cap the result count
        pool_size = max(first_stage_k, k)

        # Get results from both retrievers
        dense_results = self.dense.search(query, k=pool_size)
        sparse_results = self.sparse.search(query, k=pool_size)

        # Compute RRF scores
        rrf_scores: Dict[int, Dict[str, Any]] = {}
        self._add_ranking(rrf_scores, dense_results, self.alpha, "dense")
        self._add_ranking(rrf_scores, sparse_results, 1 - self.alpha, "sparse")

        # Sort by RRF score, best first (stable, so ties keep insertion order)
        sorted_results = sorted(
            rrf_scores.values(),
            key=lambda x: -x["rrf_score"]
        )

        # Return top-k with metadata
        return [
            (
                r["doc"],
                r["rrf_score"],
                {
                    "dense_rank": r["dense_rank"],
                    "dense_score": r["dense_score"],
                    "sparse_rank": r["sparse_rank"],
                    "sparse_score": r["sparse_score"]
                }
            )
            for r in sorted_results[:k]
        ]

    def _linear_search(
        self,
        query: str,
        k: int
    ) -> List[Tuple[Document, float, Dict]]:
        """
        Linear combination of normalized scores.

        hybrid_score = alpha * norm(dense) + (1-alpha) * norm(sparse)

        Both score vectors are indexed by position, so the two retrievers
        must hold their documents in the same order.
        """
        # Get all scores
        dense_scores = self.dense.get_all_scores(query)
        sparse_scores = self.sparse.get_all_scores(query)

        # Min-max normalize
        def normalize(scores):
            min_s, max_s = scores.min(), scores.max()
            # (Near-)constant scores carry no ranking signal; map them to 0
            if max_s - min_s < 1e-8:
                return np.zeros_like(scores)
            return (scores - min_s) / (max_s - min_s)

        dense_norm = normalize(dense_scores)
        sparse_norm = normalize(sparse_scores)

        # Combine
        hybrid_scores = self.alpha * dense_norm + (1 - self.alpha) * sparse_norm

        # Get top-k, highest combined score first
        top_indices = np.argsort(hybrid_scores)[::-1][:k]

        return [
            (
                self.documents[i],
                float(hybrid_scores[i]),
                {
                    "dense_score": float(dense_scores[i]),
                    "sparse_score": float(sparse_scores[i]),
                    "dense_norm": float(dense_norm[i]),
                    "sparse_norm": float(sparse_norm[i])
                }
            )
            for i in top_indices
        ]

# Fuse the dense and BM25 retrievers with equally weighted RRF
hybrid_retriever = HybridRetriever(
    dense_retriever=dense_retriever,
    sparse_retriever=bm25_retriever,
    alpha=0.5,
    fusion_method="rrf",
)

print("Hybrid retriever created!")

## Exercise 4 Solution: Compare Retrieval Methods

**Task**: Benchmark dense vs sparse vs hybrid retrieval.

In [None]:
def compare_retrieval_methods(
    test_queries: List[Dict[str, Any]],
    dense: DenseRetriever,
    sparse: BM25Retriever,
    hybrid: HybridRetriever,
    k: int = 5
) -> Dict[str, Dict]:
    """
    Compare retrieval methods on test queries.

    Metrics:
    - Recall@k: Did we find the expected source in the top k? (hit rate)
    - MRR: Mean Reciprocal Rank of the first hit (0 when there is none)
    - Latency: Mean query time in milliseconds

    Args:
        test_queries: Dicts with a "query" string and an "expected_source"
            substring that must occur in a result's ``metadata["source"]``.
            Queries without a non-empty "expected_source" are still timed
            but excluded from Recall@k and MRR, because an empty expectation
            would match every source.
        dense: Dense retriever.
        sparse: Sparse retriever.
        hybrid: Hybrid retriever (reported as "Hybrid (RRF)").
        k: Number of results requested from each retriever.

    Returns:
        ``{method: {"recall@k", "mrr", "latency_ms"}}``. A metric with no
        samples is reported as NaN.
    """
    # Evaluated in this order for every query
    retrievers = {
        "Dense": dense,
        "Sparse (BM25)": sparse,
        "Hybrid (RRF)": hybrid
    }
    results = {
        name: {"recalls": [], "mrrs": [], "latencies": []}
        for name in retrievers
    }

    for query_data in test_queries:
        query = query_data["query"]
        expected = query_data.get("expected_source", "")

        for method_name, retriever in retrievers.items():
            # perf_counter is monotonic and high resolution, unlike time.time()
            start = time.perf_counter()
            method_results = retriever.search(query, k=k)
            results[method_name]["latencies"].append(time.perf_counter() - start)

            # "" is a substring of every source, so an unlabelled query would
            # otherwise be scored as a rank-1 hit for every method.
            if not expected:
                continue

            # Results are (doc, score) or (doc, score, metadata); only doc is needed
            reciprocal_rank = 0.0
            for rank, result in enumerate(method_results, start=1):
                if expected in result[0].metadata.get("source", ""):
                    reciprocal_rank = 1 / rank
                    break

            results[method_name]["recalls"].append(1.0 if reciprocal_rank else 0.0)
            results[method_name]["mrrs"].append(reciprocal_rank)

    def mean_or_nan(values):
        # Avoid numpy's "mean of empty slice" warning for empty samples
        return float(np.mean(values)) if values else float("nan")

    # Aggregate
    return {
        method: {
            "recall@k": mean_or_nan(metrics["recalls"]),
            "mrr": mean_or_nan(metrics["mrrs"]),
            "latency_ms": mean_or_nan(metrics["latencies"]) * 1000
        }
        for method, metrics in results.items()
    }

# Evaluation set: (query, substring expected in the source file name)
test_queries = [
    {"query": question, "expected_source": source}
    for question, source in [
        ("What is the memory capacity of DGX Spark?", "dgx_spark"),
        ("How does LoRA reduce trainable parameters?", "lora"),
        ("Explain the attention mechanism", "transformer"),
        ("What quantization methods exist?", "quantization"),
        ("vector database comparison", "vector_database"),
        ("128GB unified memory", "dgx_spark"),  # Keyword-heavy
        ("efficient model training techniques", "lora"),  # Semantic
        ("NVFP4 vs AWQ quantization", "quantization"),  # Mixed
    ]
]

# Benchmark the three retrievers on the evaluation set
comparison = compare_retrieval_methods(
    test_queries,
    dense=dense_retriever,
    sparse=bm25_retriever,
    hybrid=hybrid_retriever,
)

# Render one table row per method
banner = "="*70
print(banner)
print("RETRIEVAL METHOD COMPARISON")
print(banner)
print(f"{'Method':<20} {'Recall@5':<12} {'MRR':<12} {'Latency (ms)':<12}")
print("-"*56)

for method in comparison:
    metrics = comparison[method]
    print(f"{method:<20} {metrics['recall@k']:<12.3f} {metrics['mrr']:<12.3f} {metrics['latency_ms']:<12.2f}")

## Exercise 5 Solution: Optimize Alpha Parameter

**Task**: Find the optimal balance between dense and sparse retrieval.

In [None]:
def optimize_alpha(
    test_queries: List[Dict],
    dense: DenseRetriever,
    sparse: BM25Retriever,
    alpha_values: Optional[List[float]] = None
) -> Dict[float, float]:
    """
    Find optimal alpha for hybrid search.

    For every candidate alpha an RRF ``HybridRetriever`` is built and its
    Recall@5 (share of queries whose expected source appears in the top 5)
    is measured.

    Args:
        test_queries: Dicts with a "query" string and an "expected_source"
            substring. Queries without a non-empty "expected_source" are
            skipped, because an empty expectation would match every source.
        dense: Dense retriever.
        sparse: Sparse retriever.
        alpha_values: Dense weights to try; defaults to
            ``[0.0, 0.25, 0.5, 0.75, 1.0]``.

    Returns:
        ``{alpha: recall@5}``; NaN when no labelled query is available.
    """
    # None instead of a list literal: a mutable default is shared across calls
    if alpha_values is None:
        alpha_values = [0.0, 0.25, 0.5, 0.75, 1.0]

    labelled_queries = [
        (query_data["query"], query_data["expected_source"])
        for query_data in test_queries
        if query_data.get("expected_source")
    ]

    results = {}

    for alpha in alpha_values:
        hybrid = HybridRetriever(dense, sparse, alpha=alpha, fusion_method="rrf")

        recalls = [
            1.0 if any(
                expected in doc.metadata.get("source", "")
                for doc, _, _ in hybrid.search(query, k=5)
            ) else 0.0
            for query, expected in labelled_queries
        ]

        results[alpha] = float(np.mean(recalls)) if recalls else float("nan")

    return results

# Sweep the dense weight from pure sparse (0.0) to pure dense (1.0)
alpha_results = optimize_alpha(
    test_queries,
    dense=dense_retriever,
    sparse=bm25_retriever,
    alpha_values=[0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
)

rule = "="*50
print("\n" + rule)
print("ALPHA OPTIMIZATION RESULTS")
print(rule)
print(f"{'Alpha':<10} {'Dense Weight':<15} {'Sparse Weight':<15} {'Recall@5':<10}")
print("-"*50)

# On ties, max() keeps the first alpha in insertion order
best_alpha = max(alpha_results, key=alpha_results.get)
for alpha in sorted(alpha_results):
    recall = alpha_results[alpha]
    if alpha == best_alpha:
        marker = " <-- BEST"
    else:
        marker = ""
    print(f"{alpha:<10.2f} {alpha:<15.2f} {1-alpha:<15.2f} {recall:<10.3f}{marker}")

print(f"\nOptimal alpha: {best_alpha} (Recall@5: {alpha_results[best_alpha]:.3f})")

## Key Insights

In [None]:
# Closing summary of the lab; the text below is printed verbatim.
insights = """
HYBRID SEARCH INSIGHTS
======================

1. WHEN TO USE EACH METHOD
   - Dense: Semantic similarity, paraphrase matching
   - Sparse (BM25): Exact keyword matching, technical terms
   - Hybrid: General purpose, unknown query distribution

2. RRF vs LINEAR FUSION
   - RRF: More robust, doesn't need score calibration
   - Linear: Faster, but requires careful normalization
   - Recommendation: Use RRF for production

3. ALPHA TUNING GUIDELINES
   - alpha=0.5: Good default for balanced queries
   - alpha>0.5: Prioritize semantic understanding
   - alpha<0.5: Prioritize keyword matching
   - Tune based on your query distribution!

4. DGX SPARK OPTIMIZATION
   - Pre-compute dense embeddings on GPU
   - BM25 runs efficiently on CPU
   - Batch queries for better GPU utilization

5. PRODUCTION CONSIDERATIONS
   - Cache both embedding and BM25 results
   - Use first_stage_k to control candidate pool
   - Monitor which method contributes more to wins
"""

print(insights)