# 4. ColBERT (Token-Level Embeddings)

**What:** Creates separate embeddings for each token, not a single vector per chunk

**Why:** Standard embeddings pool all token meanings into a single vector, so small but important terms get lost

**When:** Keyword/numeric queries, scientific terminology, or any case where exact matches matter

**The Problem:**
```
Document: "The learning rate was set to 0.001 with Adam optimizer"
Standard: [single 1536-dim vector] --> "0.001" gets averaged away
Query:    "What was the learning rate?" --> May fail to match
```

**ColBERT Solution:**
```
Standard Embedding:
  "The learning rate was 0.001" --> [single 1536-dim vector]
  Good for: Conceptual/semantic queries
  Bad for:  Specific keywords, exact values

ColBERT:
  "The learning rate was 0.001"
    |      |       |    |    |
  [v1]   [v2]    [v3] [v4] [v5]  <-- Token-level vectors
  Good for: Keyword queries, exact matching
  Slower:   More vectors to store and compare
```

In [None]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

from config import embeddings, load_documents, format_docs

# Probe for the optional RAGatouille dependency. COLBERT_AVAILABLE records
# the outcome so later cells can skip ColBERT work instead of failing.
try:
    from ragatouille import RAGPretrainedModel
except ImportError:
    COLBERT_AVAILABLE = False
    print("RAGatouille not installed. Install with: pip install ragatouille")
else:
    COLBERT_AVAILABLE = True
    print("ColBERT available via RAGatouille")

## Create ColBERT Index

In [None]:
def create_colbert_index(documents, index_name="colbert_index"):
    """Build a ColBERT index over ``documents`` through RAGatouille.

    Args:
        documents: Sized collection of objects exposing ``page_content``.
        index_name: Name handed to RAGatouille for the index.

    Returns:
        The loaded ``RAGPretrainedModel`` after indexing, or ``None`` when
        ``COLBERT_AVAILABLE`` is false.
    """
    if COLBERT_AVAILABLE:
        doc_count = len(documents)
        print(f"Creating ColBERT index for {doc_count} documents...")

        # Load the pretrained ColBERTv2 checkpoint.
        colbert = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

        # One text plus one synthetic "doc_<position>" id per input document.
        passages = [item.page_content for item in documents]
        passage_ids = [f"doc_{position}" for position in range(doc_count)]

        colbert.index(
            collection=passages,
            document_ids=passage_ids,
            index_name=index_name,
            max_document_length=512,
            split_documents=True,
        )

        print("ColBERT index created")
        return colbert

    print("ColBERT not available")
    return None

## ColBERT Search

In [None]:
def colbert_search(query, RAG, k=3):
    """Run ``query`` against a ColBERT index and print a short result summary.

    Args:
        query: Search string passed to ``RAG.search``.
        RAG: Object exposing ``search(query, k=...)`` that returns dict-like
            hits, or ``None`` when no index was built.
        k: Number of results to request.

    Returns:
        Whatever ``RAG.search`` returned, or an empty list when ``RAG`` is
        ``None``.
    """
    if RAG is None:
        print("No ColBERT index available")
        return []

    print(f"Query: {query}\n")

    hits = RAG.search(query, k=k)
    print(f"Found {len(hits)} results:")

    for rank, hit in enumerate(hits, start=1):
        # Hits missing a key fall back to a zero score / empty text.
        relevance = hit.get('score', 0)
        snippet = hit.get('content', '')[:80]
        print(f"  {rank}. (score: {relevance:.3f}) {snippet}...")

    return hits

## Compare Standard vs ColBERT

In [None]:
def compare_standard_vs_colbert(query, documents, RAG, k=3):
    """Print standard-embedding and ColBERT results for ``query`` side by side.

    Args:
        query: Search string given to both retrievers.
        documents: Documents for the FAISS index; only the first 30 are
            embedded. May be empty, in which case the standard search is
            skipped.
        RAG: Object exposing ``search(query, k=...)`` that returns dict-like
            hits, or ``None`` to skip the ColBERT half.
        k: Number of results requested from each retriever.

    Returns:
        The standard (FAISS) results, or an empty list when ``documents`` is
        empty. ColBERT results are printed but not returned.
    """
    print("="*60)
    print("COMPARISON: Standard vs ColBERT")
    print("="*60)
    print(f"Query: {query}")
    print("(ColBERT excels at keyword/exact-value queries)\n")

    # Standard embedding search
    print("--- STANDARD EMBEDDING ---")
    if documents:
        vectorstore = FAISS.from_documents(documents[:30], embeddings)
        standard_results = vectorstore.similarity_search(query, k=k)
    else:
        # FAISS.from_documents cannot build an index from zero documents,
        # so report the situation instead of letting it raise.
        print("  (no documents to index)")
        standard_results = []

    for rank, doc in enumerate(standard_results, start=1):
        print(f"  {rank}. {doc.page_content[:80]}...")

    # ColBERT search. Test against None (as colbert_search does) so the
    # check does not depend on the index object's truthiness.
    if RAG is not None:
        print("\n--- COLBERT ---")
        colbert_results = RAG.search(query, k=k)

        for rank, hit in enumerate(colbert_results, start=1):
            score = hit.get('score', 0)
            content = hit.get('content', '')[:80]
            print(f"  {rank}. (score: {score:.3f}) {content}...")

    return standard_results

## Test

In [None]:
# Load the corpus once; both retrievers below draw from it.
documents = load_documents()

if COLBERT_AVAILABLE:
    # Index a 20-document sample under its own index name.
    RAG = create_colbert_index(documents[:20], index_name="test_colbert")

    # Keyword-style queries (ColBERT excels here)
    keyword_queries = [
        "learning rate 0.001",
        "Adam optimizer",
        "molecular weight threshold",
    ]

    for query in keyword_queries:
        print("\n" + "=" * 60)
        colbert_search(query, RAG, k=3)

    # Finish with a side-by-side run against the standard embedding search.
    print("\n")
    compare_standard_vs_colbert(
        "Adam optimizer learning rate",
        documents[:20],
        RAG,
    )
else:
    print("Install RAGatouille to test ColBERT: pip install ragatouille")