# Lab 3.5.3 Solutions: Vector Database Comparison

Complete solutions with GPU benchmarks and production recommendations.

## Setup

In [None]:
import shutil
import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

sys.path.insert(0, '..')

import numpy as np
import torch
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Report which accelerator (if any) this session can see.
print("GPU: {}".format(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'))
print("CUDA available: {}".format(torch.cuda.is_available()))

In [None]:
# Load and chunk documents
def prepare_data(
    data_dir: str = "../data/sample_documents",
    chunk_size: int = 500,
    chunk_overlap: int = 100,
) -> List[Document]:
    """Load every ``*.md`` file in ``data_dir`` and split it into chunks.

    Args:
        data_dir: Directory searched (non-recursively) for Markdown files.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.

    Returns:
        The chunks produced by the splitter, each carrying the originating
        file name under the ``"source"`` metadata key. An empty list is
        returned when no Markdown file is found.
    """
    # Sort the matches: glob order depends on the filesystem, and a stable
    # chunk order keeps benchmark runs comparable with each other.
    documents = [
        Document(
            page_content=file_path.read_text(encoding='utf-8'),
            metadata={"source": file_path.name},
        )
        for file_path in sorted(Path(data_dir).glob("*.md"))
    ]
    if not documents:
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Chunk the sample documents once; the benchmark cells below reuse this list.
chunks = prepare_data()
print(f"Prepared {len(chunks)} chunks for testing")

# Load embedding model
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(normalize_embeddings=True, batch_size=64),
)
print(f"Embedding model loaded on {device}")

## Exercise 1 Solution: Implement Vector Store Wrappers

**Task**: Create a unified interface for ChromaDB, FAISS, and Qdrant.

In [None]:
class VectorStoreWrapper:
    """Common base for the vector store backends compared in this lab.

    Subclasses override ``build`` and ``search``; ``cleanup`` is an optional
    hook whose base implementation does nothing.
    """

    def __init__(self, name: str):
        # No backend and no timing information exist until build() runs.
        self.store = None
        self.build_time = 0
        self.name = name

    def build(self, chunks: List[Document], embedding_model) -> float:
        """Create the store from ``chunks`` and return the build time in seconds."""
        raise NotImplementedError()

    def search(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
        """Return ``(document, score)`` tuples for documents similar to ``query``."""
        raise NotImplementedError()

    def cleanup(self):
        """Release backend resources; nothing to release in the base class."""
        return None


class ChromaWrapper(VectorStoreWrapper):
    """ChromaDB wrapper that persists into a throwaway temporary directory."""

    def __init__(self):
        super().__init__("ChromaDB")
        # Path of the temporary persistence directory; None when none exists.
        self.persist_dir = None

    def build(self, chunks: List[Document], embedding_model) -> float:
        """Build a Chroma store from ``chunks``.

        Returns:
            Seconds spent inside ``Chroma.from_documents``.

        Raises:
            Whatever ``Chroma.from_documents`` raises; the temporary
            directory is removed before the exception propagates.
        """
        # A rebuild must not orphan the directory of a previous build.
        self.cleanup()
        self.persist_dir = tempfile.mkdtemp()

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        try:
            self.store = Chroma.from_documents(
                documents=chunks,
                embedding=embedding_model,
                persist_directory=self.persist_dir
            )
        except Exception:
            # Callers may not call cleanup() after a failed build, so remove
            # the directory here instead of leaking it.
            shutil.rmtree(self.persist_dir, ignore_errors=True)
            self.persist_dir = None
            raise
        self.build_time = time.perf_counter() - start
        return self.build_time

    def search(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
        """Return the store's ``similarity_search_with_score`` results for ``query``."""
        return self.store.similarity_search_with_score(query, k=k)

    def cleanup(self):
        """Drop the store reference and delete the temporary directory, if any."""
        self.store = None
        if self.persist_dir and Path(self.persist_dir).exists():
            shutil.rmtree(self.persist_dir)
        # Forget the path so a second cleanup() call is a no-op.
        self.persist_dir = None


class FAISSWrapper(VectorStoreWrapper):
    """FAISS wrapper with optional GPU placement of the index.

    When GPU placement is requested but does not happen (CUDA unavailable or
    the transfer fails) the wrapper renames itself to ``"FAISS-GPU->CPU"`` so
    that benchmark results are not reported under a GPU label.
    """

    def __init__(self, use_gpu: bool = True):
        super().__init__(f"FAISS-{'GPU' if use_gpu else 'CPU'}")
        self.use_gpu = use_gpu
        # True only while the index of the current store lives on the GPU.
        self.on_gpu = False
        # Holds the faiss GPU resources object for as long as the GPU index
        # is in use, so it cannot be garbage-collected underneath the index.
        self._gpu_resources = None

    def build(self, chunks: List[Document], embedding_model) -> float:
        """Build the FAISS store and, if requested, move its index to the GPU.

        Returns:
            Seconds spent building the store, including the GPU transfer.

        Raises:
            ImportError: if the LangChain FAISS vector store cannot be imported.
        """
        try:
            from langchain_community.vectorstores import FAISS
        except ImportError as err:
            raise ImportError("Install faiss: pip install faiss-gpu") from err

        requested_name = f"FAISS-{'GPU' if self.use_gpu else 'CPU'}"
        self.on_gpu = False
        self._gpu_resources = None

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        self.store = FAISS.from_documents(
            documents=chunks,
            embedding=embedding_model
        )

        # Move to GPU if requested
        if self.use_gpu:
            if torch.cuda.is_available():
                self._move_index_to_gpu()
            else:
                print("  CUDA not available, using CPU")

        # Label the results by where the index really is, not by the request.
        if self.use_gpu and not self.on_gpu:
            self.name = "FAISS-GPU->CPU"
        else:
            self.name = requested_name

        self.build_time = time.perf_counter() - start
        return self.build_time

    def _move_index_to_gpu(self) -> None:
        """Best-effort transfer of the store's index to GPU device 0."""
        try:
            import faiss
            res = faiss.StandardGpuResources()
            self.store.index = faiss.index_cpu_to_gpu(res, 0, self.store.index)
        except Exception as e:
            # Deliberate fallback: the CPU index built above stays in place.
            print(f"  GPU transfer failed, using CPU: {e}")
            return
        self._gpu_resources = res
        self.on_gpu = True
        print("  FAISS index moved to GPU")

    def search(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
        """Return the store's ``similarity_search_with_score`` results for ``query``."""
        return self.store.similarity_search_with_score(query, k=k)

    def cleanup(self):
        """Drop the store (and with it the index) before the GPU resources."""
        self.store = None
        self._gpu_resources = None
        self.on_gpu = False


class QdrantWrapper(VectorStoreWrapper):
    """Qdrant wrapper using an in-memory collection."""

    def __init__(self):
        super().__init__("Qdrant")
        self.collection_name = "benchmark_collection"

    def build(self, chunks: List[Document], embedding_model) -> float:
        """Build an in-memory Qdrant collection from ``chunks``.

        Returns:
            Seconds spent inside ``Qdrant.from_documents``.

        Raises:
            ImportError: if ``langchain_qdrant`` cannot be imported.
        """
        try:
            from langchain_qdrant import Qdrant
        except ImportError as err:
            raise ImportError(
                "Install qdrant: pip install langchain-qdrant qdrant-client"
            ) from err

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()

        # Use in-memory storage for benchmarking
        self.store = Qdrant.from_documents(
            documents=chunks,
            embedding=embedding_model,
            location=":memory:",
            collection_name=self.collection_name
        )

        self.build_time = time.perf_counter() - start
        return self.build_time

    def search(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
        """Return the store's ``similarity_search_with_score`` results for ``query``.

        NOTE(review): the meaning of the score (distance vs. similarity) may
        differ between backends - confirm before comparing raw scores.
        """
        return self.store.similarity_search_with_score(query, k=k)

    def cleanup(self):
        """Drop the reference to the in-memory store so it can be collected."""
        self.store = None

# Confirmation message: reaching this line means the wrapper classes above
# were defined without raising.
print("Vector store wrappers defined")

## Exercise 2 Solution: Comprehensive Benchmark

**Task**: Compare build time, query latency, and memory usage.

In [None]:
def benchmark_vector_store(
    wrapper: VectorStoreWrapper,
    chunks: List[Document],
    embedding_model,
    test_queries: List[str],
    num_iterations: int = 10
) -> Dict[str, Any]:
    """
    Comprehensive benchmark for a vector store.

    Builds the store through ``wrapper``, runs up to three warm-up searches,
    then times ``num_iterations`` passes over ``test_queries``. The wrapper's
    ``cleanup()`` is always called, even when building or searching fails.

    Args:
        wrapper: Store wrapper to benchmark.
        chunks: Documents handed to ``wrapper.build``.
        embedding_model: Embedding model handed to ``wrapper.build``.
        test_queries: Queries to time; must not be empty.
        num_iterations: Number of timed passes over ``test_queries`` (>= 1).

    Returns:
        Dict with the keys ``name``, ``build_time_s``,
        ``query_latency_mean_ms``, ``query_latency_p50_ms``,
        ``query_latency_p95_ms``, ``query_latency_p99_ms``,
        ``cpu_memory_mb``, ``gpu_memory_mb`` and ``throughput_qps``.

    Raises:
        ValueError: if ``test_queries`` is empty or ``num_iterations`` < 1.
    """
    # Fail fast: without at least one timed query the statistics below are
    # undefined, and the build that precedes them is expensive.
    if not test_queries:
        raise ValueError("test_queries must contain at least one query")
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    import gc
    import psutil

    process = psutil.Process()
    cuda_available = torch.cuda.is_available()

    # Measure memory before
    gc.collect()
    gpu_mem_before = 0
    if cuda_available:
        torch.cuda.empty_cache()
        gpu_mem_before = torch.cuda.memory_allocated()
    cpu_mem_before = process.memory_info().rss

    try:
        # Build
        print(f"  Building {wrapper.name}...")
        build_time = wrapper.build(chunks, embedding_model)

        # Measure memory after build
        cpu_mem_after = process.memory_info().rss
        gpu_mem_used = 0
        if cuda_available:
            # NOTE(review): memory_allocated() reports memory held by PyTorch
            # tensors; allocations made by other libraries (e.g. a FAISS GPU
            # index) are presumably not included - confirm before relying on it.
            gpu_mem_used = (torch.cuda.memory_allocated() - gpu_mem_before) / (1024**2)
        cpu_mem_used = (cpu_mem_after - cpu_mem_before) / (1024**2)

        # Query latency benchmark
        print("  Benchmarking queries...")

        # Warmup
        for query in test_queries[:3]:
            wrapper.search(query, k=5)

        # Actual benchmark; perf_counter is monotonic and high-resolution.
        latencies = []
        for _ in range(num_iterations):
            for query in test_queries:
                start = time.perf_counter()
                wrapper.search(query, k=5)
                latencies.append(time.perf_counter() - start)
    finally:
        # Cleanup must run on failure too, otherwise backend resources
        # (temporary directories, indexes) are left behind.
        wrapper.cleanup()

    latencies_ms = np.asarray(latencies) * 1000  # Convert to ms
    mean_ms = float(np.mean(latencies_ms))
    p50_ms, p95_ms, p99_ms = (float(p) for p in np.percentile(latencies_ms, [50, 95, 99]))

    return {
        "name": wrapper.name,
        "build_time_s": build_time,
        "query_latency_mean_ms": mean_ms,
        "query_latency_p50_ms": p50_ms,
        "query_latency_p95_ms": p95_ms,
        "query_latency_p99_ms": p99_ms,
        "cpu_memory_mb": cpu_mem_used,
        "gpu_memory_mb": gpu_mem_used,
        # Queries per second; guard against a zero mean from timer granularity.
        "throughput_qps": 1000 / mean_ms if mean_ms > 0 else float("inf"),
    }

# Test queries
test_queries = [
    "What is the memory capacity of DGX Spark?",
    "How does LoRA reduce trainable parameters?",
    "Explain transformer attention",
    "What quantization methods are available?",
    "How do vector databases work?",
    "What is NVFP4 quantization?",
    "Explain RAG architecture",
    "What is the difference between ChromaDB and FAISS?",
]

# Run benchmarks
print("Running comprehensive benchmarks...\n")

# ChromaDB first, then FAISS with and without the GPU request.
wrappers = [ChromaWrapper()]
wrappers += [FAISSWrapper(use_gpu=gpu_flag) for gpu_flag in (True, False)]

# Try to add Qdrant if available
try:
    from langchain_qdrant import Qdrant
except ImportError:
    print("Qdrant not installed, skipping...")
else:
    wrappers.append(QdrantWrapper())

# One failing backend is reported and skipped; the remaining ones still run.
results = []
for wrapper in wrappers:
    try:
        result = benchmark_vector_store(
            wrapper,
            chunks,
            embedding_model,
            test_queries,
        )
        results.append(result)
        print(f"  {wrapper.name}: {result['query_latency_mean_ms']:.2f}ms avg latency\n")
    except Exception as e:
        print(f"  {wrapper.name} failed: {e}\n")

In [None]:
# Display results
print("=" * 100)
print("VECTOR DATABASE BENCHMARK RESULTS")
print("=" * 100)
print(
    f"{'Database':<15} {'Build (s)':<12} {'Latency (ms)':<15} "
    f"{'P95 (ms)':<12} {'QPS':<10} {'Memory (MB)':<12}"
)
print("-" * 76)

# Rows are ordered from lowest to highest mean query latency.
for r in sorted(results, key=lambda x: x['query_latency_mean_ms']):
    print(
        f"{r['name']:<15} "
        f"{r['build_time_s']:<12.2f} "
        f"{r['query_latency_mean_ms']:<15.2f} "
        f"{r['query_latency_p95_ms']:<12.2f} "
        f"{r['throughput_qps']:<10.0f} "
        f"{r['cpu_memory_mb']:<12.1f}"
    )

# Find best in each category
if results:
    print("\n" + "=" * 60 + "\nCATEGORY WINNERS\n" + "=" * 60)

    fastest_build = min(results, key=lambda x: x['build_time_s'])
    lowest_latency = min(results, key=lambda x: x['query_latency_mean_ms'])
    highest_qps = max(results, key=lambda x: x['throughput_qps'])
    lowest_memory = min(results, key=lambda x: x['cpu_memory_mb'])

    print("\n".join([
        f"Fastest Build:    {fastest_build['name']} ({fastest_build['build_time_s']:.2f}s)",
        f"Lowest Latency:   {lowest_latency['name']} ({lowest_latency['query_latency_mean_ms']:.2f}ms)",
        f"Highest QPS:      {highest_qps['name']} ({highest_qps['throughput_qps']:.0f} qps)",
        f"Lowest Memory:    {lowest_memory['name']} ({lowest_memory['cpu_memory_mb']:.1f} MB)",
    ]))

## Exercise 3 Solution: Scaling Analysis

**Task**: Test how each database scales with document count.

In [None]:
def scaling_analysis(
    base_chunks: List[Document],
    embedding_model,
    test_queries: List[str],
    scale_factors: Optional[List[int]] = None
) -> Dict[str, List[Dict]]:
    """
    Analyze how databases scale with increasing data.

    For ChromaDB and GPU-requested FAISS, the base chunks are repeated
    ``factor`` times and each scaled dataset is benchmarked with a fresh
    wrapper. A failing benchmark is reported and skipped.

    Args:
        base_chunks: Chunks to replicate; they are copied, never modified.
        embedding_model: Embedding model handed to the benchmark.
        test_queries: Queries handed to the benchmark.
        scale_factors: Replication factors; defaults to ``[1, 2, 5, 10]``.

    Returns:
        Mapping from wrapper name to its benchmark result dicts (one per
        successful factor), each extended with a ``num_chunks`` entry.
    """
    if scale_factors is None:
        scale_factors = [1, 2, 5, 10]

    base_count = len(base_chunks)
    scaling_results = {}

    for wrapper_factory in [ChromaWrapper, lambda: FAISSWrapper(use_gpu=True)]:
        # A throwaway instance is only needed to learn the display name.
        db_name = wrapper_factory().name
        scaling_results[db_name] = []

        for factor in scale_factors:
            # Create the scaled dataset from fresh Document objects.
            # `base_chunks * factor` would repeat references to the same
            # objects, so tagging them would overwrite earlier ids and
            # modify the caller's chunks.
            scaled_chunks = [
                Document(
                    page_content=chunk.page_content,
                    metadata={**chunk.metadata, "unique_id": copy_idx * base_count + pos},
                )
                for copy_idx in range(factor)
                for pos, chunk in enumerate(base_chunks)
            ]

            print(f"  {db_name} at {len(scaled_chunks)} chunks...")

            try:
                result = benchmark_vector_store(
                    wrapper_factory(),
                    scaled_chunks,
                    embedding_model,
                    test_queries,
                    num_iterations=3
                )
            except Exception as e:
                # Best effort: report the failure and continue with the next size.
                print(f"    Failed: {e}")
                continue

            result['num_chunks'] = len(scaled_chunks)
            scaling_results[db_name].append(result)

    return scaling_results

print("Running scaling analysis...\n")
scaling = scaling_analysis(chunks, embedding_model, test_queries[:3], scale_factors=[1, 2, 3])

print("\n" + "="*80)
print("SCALING ANALYSIS RESULTS")
print("="*80)

# Use a dedicated loop name: `results` already holds the benchmark results
# of the earlier cell and must not be overwritten by this loop.
for db_name, db_results in scaling.items():
    print(f"\n{db_name}:")
    print(f"{'Chunks':<10} {'Build (s)':<12} {'Latency (ms)':<15} {'QPS':<10}")
    print("-"*47)
    for r in db_results:
        print(f"{r['num_chunks']:<10} {r['build_time_s']:<12.2f} {r['query_latency_mean_ms']:<15.2f} {r['throughput_qps']:<10.0f}")

## Recommendations

In [None]:
# Static, hand-written selection guide printed verbatim below; the text is a
# fixed literal and is not derived from the benchmark results computed above.
recommendations = """
VECTOR DATABASE SELECTION GUIDE
================================

CHROMADB
--------
Best for: Development, prototyping, small-medium datasets
Strengths:
  - Easy setup, no external dependencies
  - Built-in persistence
  - Good filtering support
Weaknesses:
  - Slower at scale (>100k vectors)
  - No native GPU support
Use when: Prototyping, datasets < 100k vectors

FAISS (GPU)
-----------
Best for: High-performance production, large datasets
Strengths:
  - Extremely fast search (GPU accelerated)
  - Excellent scaling characteristics
  - Multiple index types (IVF, HNSW, PQ)
Weaknesses:
  - Requires GPU for best performance
  - No built-in persistence (need to save/load)
  - Limited filtering
Use when: Performance critical, GPU available, > 1M vectors

QDRANT
------
Best for: Production with complex filtering needs
Strengths:
  - Excellent filtering performance
  - Good horizontal scaling
  - Rich query language
Weaknesses:
  - Slightly higher latency than FAISS
  - Requires running server for persistence
Use when: Need complex filters, production deployment

DGX SPARK RECOMMENDATIONS
=========================
- Use FAISS-GPU for maximum throughput
- ChromaDB for development/debugging
- Pre-compute embeddings with batch_size=64
- Monitor GPU memory with torch.cuda.memory_allocated()
"""

print(recommendations)