# Embeddings Notebook

This notebook handles embedding generation for the RAG system.

## Purpose
- Generate embeddings for knowledge base content
- Create query embeddings
- Test embedding models
- Visualize embedding spaces

## Usage
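Import the embedding helpers from `src.rag.embeddings` and use them to generate embeddings for your data.

## 1. Setup and Configuration

Import the required modules and load the project configuration used by the rest of the notebook.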


In [None]:
# Setup and Imports
# Standard library
import sys
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Add project root to path (must happen before the project imports below)
project_root = Path("..").resolve()
sys.path.insert(0, str(project_root))

# Import config from root config folder
from config import get_config, get_config_loader

# Import project modules
from src.rag.embeddings import EmbeddingModel, create_embedding_model

print("✅ Imports successful!")


In [None]:
# Load the shared configuration object and the loader that located it
config = get_config()
config_loader = get_config_loader()

print(f"Config loaded from: {config_loader.config_path}")

# Echo the embedding-related settings that the following cells read
for label, key in (
    ("Embedding Model", "models.embedding.model_name"),
    ("Device", "models.embedding.device"),
    ("Batch Size", "models.embedding.batch_size"),
):
    print(f"{label}: {config.get(key)}")


## 2. Initialize Embedding Model

Create an embedding model instance using the configured settings.


In [None]:
# Collect the constructor arguments from the configuration, then build the model
embedding_settings = {
    "model_name": config.get("models.embedding.model_name"),
    "device": config.get("models.embedding.device"),
}
embedding_model = EmbeddingModel(**embedding_settings)

# Report what was actually loaded (as exposed by the model object itself)
print(f"✅ Embedding model loaded: {embedding_model.model_name}")
print(f"Embedding dimension: {embedding_model.get_embedding_dimension()}")
print(f"Device: {embedding_model.device}")


## 3. Generate Embeddings for Sample Texts

Test embedding generation with sample Cirq-related texts.


In [None]:
# Five short Cirq-related prompts used as the test corpus
sample_texts = [
    "Create a 2-qubit Bell state circuit using Cirq",
    "Implement a Grover search algorithm for 3 qubits",
    "Build a VQE circuit for quantum chemistry",
    "Generate a QAOA circuit for optimization",
    "Create a quantum teleportation circuit",
]

# Encode all sample texts; batch size falls back to 32 when not configured
encode_options = {
    "batch_size": config.get("models.embedding.batch_size", 32),
    "show_progress_bar": True,
}
embeddings = embedding_model.encode(sample_texts, **encode_options)

print(f"✅ Generated embeddings for {len(sample_texts)} texts")
print(f"Embedding shape: {embeddings.shape}")
print("Embedding stats:")

# Summary statistics over every value in the embedding result
for label, reducer in (("Mean", "mean"), ("Std", "std"), ("Min", "min"), ("Max", "max")):
    print(f"  {label}: {getattr(embeddings, reducer)():.4f}")


## 4. Test Query Embeddings

Generate an embedding for a query and compare it with the sample-text embeddings using cosine similarity.


In [None]:
# Embed a single query with the model's query encoder
query = "How to create a Bell state?"
query_embedding = embedding_model.encode_queries([query])

print("✅ Query embedding generated")
print(f"Query: {query}")
print(f"Query embedding shape: {query_embedding.shape}")

# Cosine similarity between the query and each sample text embedding;
# [0] selects the row belonging to the one query that was passed in.
similarities = cosine_similarity(query_embedding, embeddings)[0]

print("\nSimilarity scores:")
for text, sim in zip(sample_texts, similarities):
    # Only mark the text with an ellipsis when it was actually cut off
    label = text if len(text) <= 50 else text[:50] + "..."
    print(f"  {label} : {sim:.4f}")


## 5. Visualize Embeddings (PCA)

Visualize embeddings in 2D using PCA for dimensionality reduction.


In [None]:
# Project the sample embeddings onto their first two principal components
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

# Use an explicit figure/axes pair instead of the implicit pyplot state,
# so the plot does not depend on whichever figure happens to be current.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=100, alpha=0.6)

# Label each point with its source text (each row of embeddings_2d is one x/y pair)
for text, (x, y) in zip(sample_texts, embeddings_2d):
    # Only add an ellipsis when the label was actually shortened
    label = text if len(text) <= 30 else text[:30] + "..."
    ax.annotate(label, (x, y), fontsize=8)

ax.set_title("Embedding Visualization (PCA)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print("✅ Visualization complete")
# Share of total variance retained by the two plotted components
print(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.2%}")


## 6. Batch Processing

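Test batch processing on a larger set of generated texts (20 in this example). Note that with the fallback batch size of 32, all 20 texts fit in a single batch; configure a smaller `models.embedding.batch_size` to exercise multiple batches.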


In [None]:
# Build 20 synthetic prompts, one per qubit count from 1 through 20
large_batch = []
for qubit_count in range(1, 21):
    large_batch.append(
        f"Cirq code example {qubit_count}: Create a quantum circuit with {qubit_count} qubits"
    )

# Encode them using the configured batch size (32 when not configured)
batch_options = {
    "batch_size": config.get("models.embedding.batch_size", 32),
    "show_progress_bar": True,
}
large_embeddings = embedding_model.encode(large_batch, **batch_options)

print(f"✅ Processed {len(large_batch)} texts in batches")
print(f"Final embeddings shape: {large_embeddings.shape}")

# Report the counters exposed by the model's get_stats()
stats = embedding_model.get_stats()
print("\nEmbedding Statistics:")
for label, stat_key in (
    ("Total embeddings", "total_embeddings"),
    ("Total batches", "total_batches"),
    ("Total texts", "total_texts"),
):
    print(f"  {label}: {stats[stat_key]}")


## 7. Model Information

Display the model's name, embedding dimension, device, and underlying model type.


In [None]:
# Horizontal rule printed above and below the summary
separator = "=" * 60

print("Embedding Model Information:")
print(separator)

# Fields shown in the summary, in display order
model_facts = {
    "Model Name": embedding_model.model_name,
    "Embedding Dimension": embedding_model.get_embedding_dimension(),
    "Device": embedding_model.device,
    "Model Type": type(embedding_model.model).__name__,
}
for label, value in model_facts.items():
    print(f"{label}: {value}")

print(separator)
