In [None]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    Document,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.vector_stores import VectorStoreInfo, MetadataInfo
from llama_index.core.retrievers import VectorIndexRetriever, VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

from llama_index.vector_stores.chroma import ChromaVectorStore
try:
    from llama_index.vector_stores.qdrant import QdrantVectorStore
    QDRANT_AVAILABLE = True
except ImportError:
    QDRANT_AVAILABLE = False
    print("⚠️  Qdrant not installed. Install with: pip install llama-index-vector-stores-qdrant qdrant-client")

#Embeddings
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

#LLM
from llama_index.llms.google_genai import GoogleGenAI

#External Liberaries
import chromadb
if QDRANT_AVAILABLE:
    from qdrant_client import QdrantClient

from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()

import os
from pathlib import Path
import time
import warnings
warnings.filterwarnings("ignore")

print("✅ Imports successful!")

✅ Imports successful!


In [3]:
# Load environment and configure Settings
load_dotenv()

Settings.llm = GoogleGenAI(model="gemini-2.5-flash", temperature=0.1)
Settings.embed_model = GoogleGenAIEmbedding(
    model="gemini-embedding-001",
    dimensions=1536
)
Settings.chunk_size = 1024
Settings.chunk_overlap = 200

print("✅ Settings configured")

✅ Settings configured


In [4]:
# Create comprehensive sample documents
documents = [
    Document(
        text="""
        Vector databases are specialized databases designed to store and query high-dimensional vectors.
        These vectors typically represent embeddings of text, images, or other data. Vector databases
        enable efficient similarity search using algorithms like HNSW (Hierarchical Navigable Small World)
        or IVF (Inverted File Index). Popular vector databases include Qdrant, Pinecone, Weaviate, and Milvus.
        """,
        metadata={"topic": "vector_databases", "difficulty": "intermediate", "year": 2023}
    ),
    Document(
        text="""
        HNSW (Hierarchical Navigable Small World) is a graph-based algorithm for approximate nearest neighbor
        search. It builds a multi-layer graph where each layer is a subset of the previous one. The algorithm
        achieves excellent query performance (sub-millisecond) with high recall. HNSW parameters include
        M (number of connections per node) and ef_construction (search width during construction).
        """,
        metadata={"topic": "algorithms", "difficulty": "advanced", "year": 2023}
    ),
    Document(
        text="""
        Embedding models convert text into dense vector representations that capture semantic meaning.
        OpenAI's text-embedding-3-small produces 1536-dimensional vectors and is optimized for retrieval tasks.
        Open-source alternatives include sentence-transformers models like all-MiniLM-L6-v2 (384 dimensions)
        and all-mpnet-base-v2 (768 dimensions). The choice of embedding model affects retrieval quality and cost.
        """,
        metadata={"topic": "embeddings", "difficulty": "beginner", "year": 2024}
    ),
    Document(
        text="""
        Qdrant is an open-source vector database written in Rust. It supports HNSW indexing, filtering,
        and hybrid search. Qdrant can run locally (Docker) or in the cloud. Key features include payload
        filtering, quantization for memory reduction, and distributed deployments. Qdrant is particularly
        well-suited for production RAG applications.
        """,
        metadata={"topic": "qdrant", "difficulty": "intermediate", "year": 2024}
    ),
    Document(
        text="""
        Chroma is a lightweight, embedded vector database designed for AI applications. It runs in-memory
        or can persist to disk. Chroma is easy to set up and integrates seamlessly with LangChain and LlamaIndex.
        It's ideal for prototyping and small-to-medium scale applications. Chroma supports metadata filtering
        and multiple distance metrics (cosine, euclidean, dot product).
        """,
        metadata={"topic": "chroma", "difficulty": "beginner", "year": 2024}
    ),
]

print(f"✅ Created {len(documents)} sample documents")
print(f"   Topics: {', '.join(set(d.metadata['topic'] for d in documents))}")

✅ Created 5 sample documents
   Topics: vector_databases, chroma, qdrant, embeddings, algorithms


---

## 3. In-Memory Vector Store (Default)

### SimpleVectorStore: LlamaIndex's Built-in Store

## This is only for POC's, NOBODY USE THIS IN PRODUCTION 

In [5]:
# Create index with default in-memory vector store
print("Creating VectorStoreIndex (in-memory)...")
start_time = time.time()

simple_index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True
)

elapsed = time.time() - start_time
print(f"\n✅ Index created in {elapsed:.2f} seconds")
print(f"   Vector store type: SimpleVectorStore (in-memory)")

Creating VectorStoreIndex (in-memory)...


Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]


✅ Index created in 0.66 seconds
   Vector store type: SimpleVectorStore (in-memory)


---

## 4. Chroma Vector Store Integration

In [7]:
# Initialize Chroma client (in-memory)
chroma_client = chromadb.EphemeralClient() #In-memory
# For persistence: chromadb.PersistentClient(path="./chroma_db")

#collection name
collection_name = "llama_index_docs"
chroma_collection = chroma_client.create_collection(collection_name)

# Create Chroma vector store
chroma_vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create storage context
storage_context = StorageContext.from_defaults(vector_store=chroma_vector_store)

print("Creating VectorStoreIndex with Chroma...")
start_time = time.time()

chroma_index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    show_progress=True,
)

elapsed = time.time() - start_time
print(f"\n✅ Chroma index created in {elapsed:.2f} seconds")
print(f"   Collection: {collection_name}")
print(f"   Documents indexed: {chroma_collection.count()}")

Creating VectorStoreIndex with Chroma...


Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]


✅ Chroma index created in 0.83 seconds
   Collection: llama_index_docs
   Documents indexed: 5


---

## 5. Qdrant Vector Store Integration (Optional)

In [8]:
if QDRANT_AVAILABLE:
    # Initialize Qdrant client (in-memory)
    qdrant_client = QdrantClient(location=":memory:")
    # For persistence: QdrantClient(path="./qdrant_db")
    # For cloud: QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
    
    # Create Qdrant vector store
    qdrant_vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name="llama_index_qdrant",
    )
    
    # Create storage context
    qdrant_storage_context = StorageContext.from_defaults(vector_store=qdrant_vector_store)
    
    print("Creating VectorStoreIndex with Qdrant...")
    start_time = time.time()
    
    qdrant_index = VectorStoreIndex.from_documents(
        documents,
        storage_context=qdrant_storage_context,
        show_progress=True,
    )
    
    elapsed = time.time() - start_time
    print(f"\n✅ Qdrant index created in {elapsed:.2f} seconds")
    print(f"   Collection: llama_index_qdrant")
else:
    print("⚠️  Skipping Qdrant example (not installed)")
    qdrant_index = None

Creating VectorStoreIndex with Qdrant...


Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]


✅ Qdrant index created in 0.52 seconds
   Collection: llama_index_qdrant


In [14]:
# Test OpenAI embedding
openai_embed = GoogleGenAIEmbedding(
    model="gemini-embedding-001",
    dimensions=1536,
)

test_text = "Vector databases enable semantic search"
start_time = time.time()
gemini_vector = openai_embed.get_text_embedding(test_text)
gemini_time = time.time() - start_time

print(f"GoogleGenAI Embedding (gemini-embedding-001):")
print(f"  Dimensions: {len(gemini_vector)}")
print(f"  Time: {gemini_time*1000:.2f}ms")
print(f"  First 5 values: {gemini_vector[:5]}")

GoogleGenAI Embedding (gemini-embedding-001):
  Dimensions: 768
  Time: 356.02ms
  First 5 values: [-0.01889738, 0.00028500613, -0.027077155, 0.00047685512, 0.001233422]


---

## 7. Index Persistence

### Saving Index to Disk

In [15]:
# Save index to disk
persist_dir = "./storage"

print(f"Persisting index to {persist_dir}...")
simple_index.storage_context.persist(persist_dir=persist_dir)

print("\n✅ Index persisted successfully!")
print(f"   Location: {persist_dir}")

# Check what was saved
storage_path = Path(persist_dir)
if storage_path.exists():
    files = list(storage_path.glob("*"))
    print(f"   Files created: {len(files)}")
    for f in files:
        print(f"     - {f.name}")

Persisting index to ./storage...

✅ Index persisted successfully!
   Location: ./storage
   Files created: 5
     - image__vector_store.json
     - graph_store.json
     - index_store.json
     - docstore.json
     - default__vector_store.json


### Loading Index from Storage

In [22]:
import asyncio

# Load index from disk
print(f"Loading index from {persist_dir}...")

storage_context_load = StorageContext.from_defaults(persist_dir=persist_dir)
loaded_index = load_index_from_storage(storage_context_load)

print("Index loaded successfully.")

# -----------------------------
# Async query execution
# -----------------------------
async def run_engine():
    test_query_engine = loaded_index.as_query_engine(similarity_top_k=2)
    test_response = await test_query_engine.aquery("what is Qdrant?")

    print("\nTest query on loaded index:")
    print("  Query: What is Qdrant?")
    print(f"  Response: {test_response}")

# In Jupyter / Notebook
await run_engine()


Loading index from ./storage...
Index loaded successfully.

Test query on loaded index:
  Query: What is Qdrant?
  Response: Qdrant is an open-source vector database developed in Rust. It is designed to store and query high-dimensional vectors, supporting features such as HNSW indexing, filtering, and hybrid search. It can be deployed locally using Docker or in cloud environments. Its key capabilities include payload filtering, quantization for memory optimization, and distributed deployments, making it particularly suitable for production RAG applications.


---

## 8. Query Engine Configuration

### 8.1 Basic Query Engine

In [27]:
query_engine = chroma_index.as_query_engine(
    similarity_top_k=2,
    response_mode="compact"
)

async def run_query(query_text: str):
    response = await query_engine.aquery(query_text)

    print(f"Query: {query_text}\n")
    print(f"Response:\n{response}")
    print("\n" + "=" * 80)
    print(f"\nSources used: {len(response.source_nodes)}")

# Jupyter / Notebook
await run_query("What are the main vector databases mentioned?")


Query: What are the main vector databases mentioned?

Response:
The main vector databases mentioned are Qdrant, Pinecone, Weaviate, and Milvus.


Sources used: 2


### 8.2 Response Synthesis Modes Deep Dive

In [None]:
import time

modes = ["compact", "tree_summarize", "simple_summarize", "refine"]
test_query = "Explain HNSW algorithm"

print(f"Testing response modes with query: '{test_query}'\n")
print("=" * 80)
 ync def test_response_modes():
    for mode in modes:
        engine = chroma_index.as_query_engine(
            response_mode=mode,
            similarity_top_k=2,
        )

        start = time.time()
        response = await engine.aquery(test_query)
        elapsed = time.time() - start

        print(f"\nMode: {mode}")
        print(f"  Time: {elapsed:.2f}s")
        print(f"  Response length: {len(str(response))} chars")
        print(f"  Response preview: {str(response)[:200]}...")
        print("-" * 80)

# Jupyter / Notebook
await test_response_modes()


Testing response modes with query: 'Explain HNSW algorithm'


Mode: compact
  Time: 2.60s
  Response length: 604 chars
  Response preview: HNSW (Hierarchical Navigable Small World) is a graph-based algorithm designed for approximate nearest neighbor search. It constructs a multi-layer graph where each successive layer is a subset of the ...
--------------------------------------------------------------------------------

Mode: tree_summarize
  Time: 2.63s
  Response length: 621 chars
  Response preview: HNSW (Hierarchical Navigable Small World) is a graph-based algorithm designed for approximate nearest neighbor search. It constructs a multi-layer graph where each successive layer is a subset of the ...
--------------------------------------------------------------------------------

Mode: simple_summarize
  Time: 2.49s
  Response length: 622 chars
  Response preview: HNSW (Hierarchical Navigable Small World) is a graph-based algorithm designed for approximate nearest neighbor search. 

### 8.3 Streaming Responses

In [34]:
# Create streaming query engine
streaming_engine = chroma_index.as_query_engine(
    similarity_top_k=2,
    streaming=True,
)

query_text = "What is the difference between Qdrant and Chroma?"
print(f"Query: {query_text}\n")
print("Streaming response:")
print("-" * 80)

async def run_streaming_query():
    response = await streaming_engine.aquery(query_text)

    # ✅ async generator → async for
    async for text in response.response_gen:
        print(text, end="", flush=True)

    print("\n" + "=" * 80)

# Jupyter / Notebook
await run_streaming_query()


Query: What is the difference between Qdrant and Chroma?

Streaming response:
--------------------------------------------------------------------------------
Qdrant is an open-source vector database written in Rust, designed for production RAG applications, and supports distributed deployments, HNSW indexing, hybrid search, and quantization for memory reduction. It can be run locally via Docker or in the cloud.

Chroma, on the other hand, is a lightweight, embedded vector database primarily for AI applications, ideal for prototyping and small-to-medium scale use cases. It runs in-memory or can persist to disk, is easy to set up, and integrates well with LangChain and LlamaIndex, offering metadata filtering and various distance metrics.
