### Mini: Embedding Generation — 5‑minute demo

Goal: Take chunked documents and convert them to vector embeddings for similarity search.

What we'll do:
- Load the chunked documents produced by our PDF processing step
- Set up a lightweight embedding model (Hugging Face in this demo; an OpenAI model could be swapped in)
- Generate embeddings for each chunk
- Preview the vector representations
- Save embeddings with metadata for indexing

Quick and practical—ready for vector search!


In [None]:
# Install embedding dependencies
import sys, subprocess

def pip_install(package: str, module: "str | None" = None):
    """Install *package* with pip unless its import module is already available.

    The pip distribution name and the importable module name often differ
    (``sentence-transformers`` is imported as ``sentence_transformers``), so
    probing with the distribution name would fail for every hyphenated
    package and trigger a pip run on each execution.

    Args:
        package: Distribution name passed to ``pip install``.
        module: Import name used for the availability probe. Defaults to
            *package* with hyphens replaced by underscores.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    import importlib

    module_name = module or package.replace("-", "_")
    try:
        __import__(module_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the running interpreter see the freshly installed package.
        importlib.invalidate_caches()

# Packages needed for the embedding models, installed in this order
for requirement in (
    "langchain",
    "langchain-community",
    "sentence-transformers",  # HuggingFace embeddings
    "langchain-huggingface",  # LangChain HF integration
):
    pip_install(requirement)

print("Embedding setup ready ✔")


In [None]:
# Setup embedding model - using lightweight HuggingFace model
from langchain_huggingface import HuggingFaceEmbeddings

# Small, fast sentence-transformers model that is well suited to demos.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",  # 384 dimensions, fast
    model_kwargs=dict(device="cpu"),  # CPU for compatibility
    encode_kwargs=dict(normalize_embeddings=True),
)

# Smoke test: embed one sentence and inspect the resulting vector.
test_text = "This is a test sentence for embedding."
test_embedding = embedding_model.embed_query(test_text)
print(f"Model loaded ✔ Embedding dimension: {len(test_embedding)}")
print(f"Sample embedding (first 5 values): {test_embedding[:5]}")


In [None]:
# Load chunked documents from our doc_processing work
# Option 1: Re-run the LangChain chunking (if you have it)
# Option 2: Load from saved JSONL and convert to Documents

from pathlib import Path
import json
from langchain.schema import Document

# Try to load from JSONL first (simpler)
jsonl_path = Path("data/sample_chunks.jsonl")

if jsonl_path.exists():
    print("Loading from JSONL...")
    docs = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank/trailing lines are common in JSONL files and would
                # otherwise make json.loads raise JSONDecodeError.
                continue
            data = json.loads(line)
            doc = Document(
                page_content=data["text"],
                metadata={
                    "chunk_id": data["id"],
                    "source": "data/sample.pdf",
                    # Position among the loaded chunks (blank lines are not counted).
                    "chunk_index": len(docs),
                    "type": "pdf_chunk"
                }
            )
            docs.append(doc)
else:
    print("JSONL not found. Run doc_processing.ipynb first or create sample docs...")
    # Fallback: create sample documents
    docs = [
        Document(page_content="Sample document chunk 1 with some content.", metadata={"chunk_id": "sample-0"}),
        Document(page_content="Sample document chunk 2 with different content.", metadata={"chunk_id": "sample-1"})
    ]

print(f"Loaded {len(docs)} document chunks")
if docs:
    print(f"First chunk preview: {docs[0].page_content[:200]}...")
else:
    # An existing but empty JSONL yields no chunks; avoid IndexError on docs[0].
    print("No chunks were loaded, so there is nothing to preview.")


In [None]:
# Generate embeddings for all chunks
import time

print("Generating embeddings...")
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Extract text content from documents
texts = [doc.page_content for doc in docs]

# Generate embeddings in batch (more efficient)
embeddings = embedding_model.embed_documents(texts)

end_time = time.perf_counter()
print(f"Generated {len(embeddings)} embeddings in {end_time - start_time:.2f} seconds")

if embeddings:
    print(f"Each embedding has {len(embeddings[0])} dimensions")

    # Preview first embedding
    print(f"First embedding (first 10 values): {embeddings[0][:10]}")
    print(f"Embedding magnitude: {sum(x**2 for x in embeddings[0])**0.5:.3f}")  # Should be ~1.0 if normalized
else:
    # No input chunks means no vectors; skip the preview instead of raising IndexError.
    print("No embeddings were generated (no input chunks).")


In [None]:
# Quick similarity demo - compare chunks
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Args:
        a: First vector (any sequence accepted by ``np.dot``).
        b: Second vector, same length as *a*.

    Returns:
        ``dot(a, b) / (‖a‖ · ‖b‖)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and would otherwise be NaN).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Pairwise comparison over (at most) the first three chunks
print("Similarity between chunks:")
n_preview = min(3, len(embeddings))
index_pairs = [
    (first, second)
    for first in range(n_preview)
    for second in range(first + 1, n_preview)
]
for i, j in index_pairs:
    sim = cosine_similarity(embeddings[i], embeddings[j])
    print(f"Chunk {i} ↔ Chunk {j}: {sim:.3f}")
    print(f"  Text {i}: {texts[i][:100]}...")
    print(f"  Text {j}: {texts[j][:100]}...")
    print()


In [None]:
# Save embeddings with metadata for vector database
import json
import numpy as np
from pathlib import Path

# Prepare data for saving: one record per chunk, pairing each document
# with its embedding vector.
embedding_data = []
for i, (doc, embedding) in enumerate(zip(docs, embeddings)):
    record = {
        "id": doc.metadata.get("chunk_id", f"chunk-{i}"),
        "text": doc.page_content,
        "metadata": doc.metadata,
        "embedding": embedding,  # List of floats
        "embedding_model": "all-MiniLM-L6-v2",
        "embedding_dim": len(embedding)
    }
    embedding_data.append(record)

# Save as NPZ for efficient loading (vectors) + JSON for metadata
vectors_path = Path("data/embeddings.npz")
metadata_path = Path("data/embeddings_metadata.json")

# The output directory may not exist yet (e.g. when the sample-document
# fallback was used), which would make both writes fail.
vectors_path.parent.mkdir(parents=True, exist_ok=True)
metadata_path.parent.mkdir(parents=True, exist_ok=True)

# Save vectors efficiently
np.savez_compressed(
    vectors_path,
    embeddings=np.array(embeddings),
    chunk_ids=[record["id"] for record in embedding_data]
)

# Save metadata separately (everything except the vector itself)
metadata_only = [
    {k: v for k, v in record.items() if k != "embedding"}
    for record in embedding_data
]

with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata_only, f, indent=2, ensure_ascii=False)

print(f"Saved {len(embeddings)} embeddings:")
print(f"  Vectors: {vectors_path}")
print(f"  Metadata: {metadata_path}")
print("Ready for vector search! 🚀")
