In [1]:
# Core LlamaIndex
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    Document,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.vector_stores import VectorStoreInfo, MetadataInfo
from llama_index.core.retrievers import VectorIndexRetriever, VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# Vector Stores
from llama_index.vector_stores.chroma import ChromaVectorStore
try:
    from llama_index.vector_stores.qdrant import QdrantVectorStore
    QDRANT_AVAILABLE = True
except ImportError:
    QDRANT_AVAILABLE = False
    print("⚠️  Qdrant not installed. Install with: pip install llama-index-vector-stores-qdrant qdrant-client")

# Embeddings
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# LLM
from llama_index.llms.groq import Groq

# External libraries
import chromadb
if QDRANT_AVAILABLE:
    from qdrant_client import QdrantClient

from dotenv import load_dotenv
import os
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')

print("✅ Imports successful!")


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as gemini


✅ Imports successful!


In [2]:
# Load environment and configure Settings
load_dotenv()

# Configure LLM
Settings.llm = Groq(model="Qwen/Qwen3-32B",temperature=0.1)

# Configure Embedding Model
Settings.embed_model = GeminiEmbedding(
    model_name="models/gemini-embedding-001",title="this is a document",
)

Settings.chunk_size = 1024
Settings.chunk_overlap = 200

print("✅ Settings configured")

✅ Settings configured


In [3]:
# Create comprehensive sample documents
documents = [
    Document(
        text="""
        Vector databases are specialized databases designed to store and query high-dimensional vectors.
        These vectors typically represent embeddings of text, images, or other data. Vector databases
        enable efficient similarity search using algorithms like HNSW (Hierarchical Navigable Small World)
        or IVF (Inverted File Index). Popular vector databases include Qdrant, Pinecone, Weaviate, and Milvus.
        """,
        metadata={"topic": "vector_databases", "difficulty": "intermediate", "year": 2023}
    ),
    Document(
        text="""
        HNSW (Hierarchical Navigable Small World) is a graph-based algorithm for approximate nearest neighbor
        search. It builds a multi-layer graph where each layer is a subset of the previous one. The algorithm
        achieves excellent query performance (sub-millisecond) with high recall. HNSW parameters include
        M (number of connections per node) and ef_construction (search width during construction).
        """,
        metadata={"topic": "algorithms", "difficulty": "advanced", "year": 2023}
    ),
    Document(
        text="""
        Embedding models convert text into dense vector representations that capture semantic meaning.
        OpenAI's text-embedding-3-small produces 1536-dimensional vectors and is optimized for retrieval tasks.
        Open-source alternatives include sentence-transformers models like all-MiniLM-L6-v2 (384 dimensions)
        and all-mpnet-base-v2 (768 dimensions). The choice of embedding model affects retrieval quality and cost.
        """,
        metadata={"topic": "embeddings", "difficulty": "beginner", "year": 2024}
    ),
    Document(
        text="""
        Qdrant is an open-source vector database written in Rust. It supports HNSW indexing, filtering,
        and hybrid search. Qdrant can run locally (Docker) or in the cloud. Key features include payload
        filtering, quantization for memory reduction, and distributed deployments. Qdrant is particularly
        well-suited for production RAG applications.
        """,
        metadata={"topic": "qdrant", "difficulty": "intermediate", "year": 2024}
    ),
    Document(
        text="""
        Chroma is a lightweight, embedded vector database designed for AI applications. It runs in-memory
        or can persist to disk. Chroma is easy to set up and integrates seamlessly with LangChain and LlamaIndex.
        It's ideal for prototyping and small-to-medium scale applications. Chroma supports metadata filtering
        and multiple distance metrics (cosine, euclidean, dot product).
        """,
        metadata={"topic": "chroma", "difficulty": "beginner", "year": 2024}
    ),
]

print(f"✅ Created {len(documents)} sample documents")
print(f"   Topics: {', '.join(set(d.metadata['topic'] for d in documents))}")

✅ Created 5 sample documents
   Topics: qdrant, chroma, vector_databases, embeddings, algorithms


In [4]:
# Create index with default in-memory vector store
print("Creating VectorStoreIndex (in-memory)...")
start_time = time.time()

simple_index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
)

elapsed = time.time() - start_time
print(f"\n✅ Index created in {elapsed:.2f} seconds")
print(f"   Vector store type: SimpleVectorStore (in-memory)")

Creating VectorStoreIndex (in-memory)...


Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]


✅ Index created in 3.20 seconds
   Vector store type: SimpleVectorStore (in-memory)


In [5]:
# Initialize Chroma client (in-memory)
chroma_client = chromadb.EphemeralClient()  # In-memory
# For persistence: chromadb.PersistentClient(path="./chroma_db")

# Create collection
collection_name = "llama_index_docs"
chroma_collection = chroma_client.create_collection(collection_name)

# Create Chroma vector store
chroma_vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create storage context
storage_context = StorageContext.from_defaults(vector_store=chroma_vector_store)

print("Creating VectorStoreIndex with Chroma...")
start_time = time.time()

chroma_index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    show_progress=True,
)

elapsed = time.time() - start_time
print(f"\n✅ Chroma index created in {elapsed:.2f} seconds")
print(f"   Collection: {collection_name}")
print(f"   Documents indexed: {chroma_collection.count()}")

Creating VectorStoreIndex with Chroma...


Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]


✅ Chroma index created in 2.59 seconds
   Collection: llama_index_docs
   Documents indexed: 5


In [6]:
if QDRANT_AVAILABLE:
    # Initialize Qdrant client (in-memory)
    qdrant_client = QdrantClient(location=":memory:")
    # For persistence: QdrantClient(path="./qdrant_db")
    # For cloud: QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
    
    # Create Qdrant vector store
    qdrant_vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name="llama_index_qdrant",
    )
    
    # Create storage context
    qdrant_storage_context = StorageContext.from_defaults(vector_store=qdrant_vector_store)
    
    print("Creating VectorStoreIndex with Qdrant...")
    start_time = time.time()
    
    qdrant_index = VectorStoreIndex.from_documents(
        documents,
        storage_context=qdrant_storage_context,
        show_progress=True,
    )
    
    elapsed = time.time() - start_time
    print(f"\n✅ Qdrant index created in {elapsed:.2f} seconds")
    print(f"   Collection: llama_index_qdrant")
else:
    print("⚠️  Skipping Qdrant example (not installed)")
    qdrant_index = None

Creating VectorStoreIndex with Qdrant...


Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]


✅ Qdrant index created in 3.37 seconds
   Collection: llama_index_qdrant


In [9]:
# Test OpenAI embedding
# openai_embed = OpenAIEmbedding(
#     model="text-embedding-3-small",
#     dimensions=1536,
# )

gemini_embed = GeminiEmbedding(
    model_name="models/gemini-embedding-001"
)

test_text = "Vector databases enable semantic search"
start_time = time.time()
gemini_vector = gemini_embed.get_text_embedding(test_text)
openai_time = time.time() - start_time

print(f"OpenAI Embedding (text-embedding-3-small):")
print(f"  Dimensions: {len(gemini_vector)}")
print(f"  Time: {openai_time*1000:.2f}ms")
print(f"  First 5 values: {gemini_vector[:5]}")

OpenAI Embedding (text-embedding-3-small):
  Dimensions: 3072
  Time: 834.91ms
  First 5 values: [-0.0456277, -0.0052703377, 0.00029374406, -0.089979425, 0.0037476365]


In [8]:
# Test HuggingFace embedding
print("Loading HuggingFace model (this may take a moment on first run)...")
hf_embed = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # 384 dimensions
)

start_time = time.time()
hf_vector = hf_embed.get_text_embedding(test_text)
hf_time = time.time() - start_time

print(f"\nHuggingFace Embedding (all-MiniLM-L6-v2):")
print(f"  Dimensions: {len(hf_vector)}")
print(f"  Time: {hf_time*1000:.2f}ms")
print(f"  First 5 values: {hf_vector[:5]}")

Loading HuggingFace model (this may take a moment on first run)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


HuggingFace Embedding (all-MiniLM-L6-v2):
  Dimensions: 384
  Time: 445.14ms
  First 5 values: [0.04381895065307617, -0.00954702403396368, -0.020091237500309944, 0.015027557499706745, 0.013344950042665005]


In [10]:
import pandas as pd

comparison = pd.DataFrame([
    {
        "Model": "Groq embed_mode",
        "Dimensions": len(gemini_vector),
        "Time (ms)": f"{openai_time*1000:.2f}",
        "Cost": "$0.02/1M tokens",
        "Quality": "Excellent",
        "Hosting": "API",
    },
    {
        "Model": "all-MiniLM-L6-v2",
        "Dimensions": len(hf_vector),
        "Time (ms)": f"{hf_time*1000:.2f}",
        "Cost": "Free",
        "Quality": "Good",
        "Hosting": "Local",
    },
])

print("\nEmbedding Model Comparison:")
print(comparison.to_string(index=False))


Embedding Model Comparison:
           Model  Dimensions Time (ms)            Cost   Quality Hosting
 Groq embed_mode        3072    834.91 $0.02/1M tokens Excellent     API
all-MiniLM-L6-v2         384    445.14            Free      Good   Local


In [11]:
# Save index to disk
persist_dir = "./storage"

print(f"Persisting index to {persist_dir}...")
simple_index.storage_context.persist(persist_dir=persist_dir)

print("\n✅ Index persisted successfully!")
print(f"   Location: {persist_dir}")

# Check what was saved
storage_path = Path(persist_dir)
if storage_path.exists():
    files = list(storage_path.glob("*"))
    print(f"   Files created: {len(files)}")
    for f in files:
        print(f"     - {f.name}")

Persisting index to ./storage...

✅ Index persisted successfully!
   Location: ./storage
   Files created: 5
     - default__vector_store.json
     - docstore.json
     - graph_store.json
     - image__vector_store.json
     - index_store.json


In [13]:
import re
def groqLlmResponse(response):
    return  re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()

In [14]:
# Load index from disk
print(f"Loading index from {persist_dir}...")

storage_context_loaded = StorageContext.from_defaults(persist_dir=persist_dir)
loaded_index = load_index_from_storage(storage_context_loaded)

print("✅ Index loaded successfully!")

# Test the loaded index
test_query_engine = loaded_index.as_query_engine(similarity_top_k=2)
test_response = test_query_engine.query("What is Qdrant?")
clearResponse = test_response.response
print(f"\nTest query on loaded index:")
print(f"  Query: What is Qdrant?")
print(f"  Response: {groqLlmResponse(clearResponse)}")

Loading index from ./storage...
✅ Index loaded successfully!

Test query on loaded index:
  Query: What is Qdrant?
  Response: Qdrant is an open-source vector database developed in Rust, designed for efficient storage and querying of high-dimensional vectors. It supports advanced capabilities such as HNSW indexing for similarity search, payload filtering, and hybrid search methods. The system offers flexibility by allowing deployment on local environments via Docker or cloud-based setups. Additional features include quantization techniques to reduce memory usage and support for distributed architectures, making it a suitable choice for production-level RAG (Retrieval-Augmented Generation) applications.


In [15]:
# Create query engine with configuration
query_engine = chroma_index.as_query_engine(
    similarity_top_k=3,
    response_mode="compact",
)

query = "What are the main vector databases mentioned?"
response = query_engine.query(query)
clear_res = response.response

print(f"Query: {query}\n")
print(f"Response:\n{groqLlmResponse(clear_res)}")
print("\n" + "="*80)
print(f"\nSources used: {len(response.source_nodes)}")

Query: What are the main vector databases mentioned?

Response:
The main vector databases mentioned are Qdrant, Pinecone, Weaviate, and Milvus.


Sources used: 3


In [17]:
# Test all response modes
modes = ["compact", "tree_summarize", "simple_summarize", "refine"]
test_query = "Explain HNSW algorithm"

print(f"Testing response modes with query: '{test_query}'\n")
print("="*80)

for mode in modes:
    engine = chroma_index.as_query_engine(
        similarity_top_k=2,
        response_mode=mode
    )
    
    start = time.time()
    resp = engine.query(test_query)
    clear_resp = resp.response
    elapsed = time.time() - start
    
    print(f"\nMode: {mode}")
    print(f"  Time: {elapsed:.2f}s")
    print(f"  Response length: {len(str(resp))} chars")
    print(f"  Response: {str(groqLlmResponse(clear_resp))[:200]}...")
    print("-" * 80)

Testing response modes with query: 'Explain HNSW algorithm'


Mode: compact
  Time: 2.23s
  Response length: 2382 chars
  Response: HNSW (Hierarchical Navigable Small World) is a graph-based algorithm designed for efficient approximate nearest neighbor searches in high-dimensional spaces. It organizes data points into a multi-laye...
--------------------------------------------------------------------------------

Mode: tree_summarize
  Time: 1.92s
  Response length: 2042 chars
  Response: The HNSW (Hierarchical Navigable Small World) algorithm is designed for efficient approximate nearest neighbor search in high-dimensional spaces. It organizes data into a multi-layered graph structure...
--------------------------------------------------------------------------------

Mode: simple_summarize
  Time: 3.38s
  Response length: 3546 chars
  Response: The HNSW (Hierarchical Navigable Small World) algorithm is designed for efficient approximate nearest neighbor search in high-dimensional sp

In [None]:
# Create streaming query engine
streaming_engine = chroma_index.as_query_engine(
    similarity_top_k=2,
    streaming=True,
)

query = "What is the difference between Qdrant and Chroma?"
print(f"Query: {query}\n")
print("Streaming response:")
print("-" * 80)

response = streaming_engine.query(query)

inside_think = False
buffer = ""

# Stream tokens   content within <think> and </think> tags will be removed
for text in response.response_gen:
    buffer += text

    while True:
        if not inside_think:
            if "<think>" in buffer:
                before, buffer = buffer.split("<think>", 1)
                if before:
                    print(before, end="", flush=True)
                inside_think = True
            else:
                print(buffer, end="", flush=True)
                buffer = ""
                break
        else:
            if "</think>" in buffer:
                _, buffer = buffer.split("</think>", 1)
                inside_think = False
            else:
                buffer = ""
                break

print("\n" + "=" * 80)


Query: What is the difference between Qdrant and Chroma?

Streaming response:
--------------------------------------------------------------------------------


Qdrant and Chroma are both vector databases tailored for AI applications but differ in their design, capabilities, and use cases. 

**Chroma** emphasizes simplicity and ease of integration, making it ideal for prototyping and small-to-medium projects. It supports metadata filtering and multiple distance metrics (e.g., cosine, Euclidean, dot product) while offering seamless compatibility with tools like LangChain and LlamaIndex. Its lightweight nature allows it to operate in-memory or persist to disk, prioritizing developer convenience.

**Qdrant**, in contrast, is built for production-grade applications requiring advanced performance and scalability. It leverages Rust for efficiency, supports HNSW indexing, hybrid search, and distributed deployments. Features like payload filtering, quantization for memory optimization, and clo

In [25]:
# Create custom retriever
retriever = VectorIndexRetriever(
    index=chroma_index,
    similarity_top_k=3,
)

# Retrieve nodes directly (no LLM synthesis)
query_str = "vector database algorithms"
retrieved_nodes = retriever.retrieve(query_str)

print(f"Query: {query_str}\n")
print(f"Retrieved {len(retrieved_nodes)} nodes:\n")

for i, node in enumerate(retrieved_nodes, 1):
    print(f"Node {i}:")
    print(f"  Score: {node.score:.4f}")
    print(f"  Topic: {node.metadata.get('topic')}")
    print(f"  Difficulty: {node.metadata.get('difficulty')}")
    print(f"  Text (first 150 chars): {node.text[:150]}...")
    print()

Query: vector database algorithms

Retrieved 3 nodes:

Node 1:
  Score: 0.7894
  Topic: vector_databases
  Difficulty: intermediate
  Text (first 150 chars): Vector databases are specialized databases designed to store and query high-dimensional vectors.
        These vectors typically represent embeddings ...

Node 2:
  Score: 0.7228
  Topic: algorithms
  Difficulty: advanced
  Text (first 150 chars): HNSW (Hierarchical Navigable Small World) is a graph-based algorithm for approximate nearest neighbor
        search. It builds a multi-layer graph wh...

Node 3:
  Score: 0.6968
  Topic: qdrant
  Difficulty: intermediate
  Text (first 150 chars): Qdrant is an open-source vector database written in Rust. It supports HNSW indexing, filtering,
        and hybrid search. Qdrant can run locally (Doc...



In [27]:
# Build query engine from custom retriever
custom_query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    response_mode="compact",
)

response = custom_query_engine.query("Explain the HNSW algorithm")
clear_resp = response.response
print(f"Response from custom retriever:\n{groqLlmResponse(clear_resp)}")

Response from custom retriever:
The HNSW (Hierarchical Navigable Small World) algorithm is designed for efficient approximate nearest neighbor search in high-dimensional spaces. It organizes data points into a multi-layered graph structure, where each layer serves as a progressively sparser representation of the dataset. The topmost layer contains the fewest nodes, enabling rapid traversal, while lower layers retain more nodes for finer-grained searches. This hierarchical design allows queries to start at the top layer and navigate downward, narrowing the search scope with each layer.

Key aspects of HNSW include:  
1. **Graph Connectivity**: Each node connects to a fixed number of neighbors (controlled by the parameter *M*), balancing graph density and search efficiency.  
2. **Construction Process**: During graph building, a search width parameter (*ef_construction*) determines how thoroughly candidate connections are explored, influencing both accuracy and construction time.  
3. **

In [28]:
# Define metadata schema for auto-retriever
vector_store_info = VectorStoreInfo(
    content_info="Technical documentation about vector databases and embeddings",
    metadata_info=[
        MetadataInfo(
            name="topic",
            type="str",
            description="The main topic of the document (e.g., 'qdrant', 'chroma', 'embeddings')",
        ),
        MetadataInfo(
            name="difficulty",
            type="str",
            description="Difficulty level: 'beginner', 'intermediate', or 'advanced'",
        ),
        MetadataInfo(
            name="year",
            type="int",
            description="Year of publication (2023 or 2024)",
        ),
    ],
)

# Create auto-retriever
auto_retriever = VectorIndexAutoRetriever(
    chroma_index,
    vector_store_info=vector_store_info,
    similarity_top_k=3,
)

print("✅ VectorIndexAutoRetriever configured")

✅ VectorIndexAutoRetriever configured


In [29]:
# Query with natural language filters
query_with_filter = "Tell me about beginner-level topics"

print(f"Query: {query_with_filter}\n")
print("Auto-retriever will automatically extract metadata filters from the query!\n")

retrieved = auto_retriever.retrieve(query_with_filter)

print(f"Retrieved {len(retrieved)} nodes:\n")
for i, node in enumerate(retrieved, 1):
    print(f"Node {i}:")
    print(f"  Topic: {node.metadata.get('topic')}")
    print(f"  Difficulty: {node.metadata.get('difficulty')}")
    print(f"  Score: {node.score:.4f}")
    print()

Query: Tell me about beginner-level topics

Auto-retriever will automatically extract metadata filters from the query!

Retrieved 2 nodes:

Node 1:
  Topic: embeddings
  Difficulty: beginner
  Score: 0.7903

Node 2:
  Topic: chroma
  Difficulty: beginner
  Score: 0.7271

