In [2]:
#core LlamaIndex
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, Document
from llama_index.core.node_parser import SentenceSplitter

#LLM Integrations
from llama_index.llms.google_genai import GoogleGenAI
# from llama_index.llms.openai import OpenAI

#Embedding Integrations
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
# from llama_index.embeddings.openai import OpenAIEmbedding

#utilities
from dotenv import load_dotenv
import os
import warnings
warnings.filterwarnings("ignore")

import nest_asyncio
nest_asyncio.apply()

print("✅ Imports successful")


✅ Imports successful


In [3]:
# Load environment variables from .env file
load_dotenv()

# Verify OpenAI API key is set
google_genai_api_key = os.getenv("GOOGLE_API_KEY")

if not google_genai_api_key:
    raise ValueError(
        "❌ GOOGLE_API_KEY not found!\n"
        "Please create a .env file in the project root with:\n"
        "GOOGLE_API_KEY=your_key_here"
    )

print(f"✅ Google GenAI API key loaded (starts with: {google_genai_api_key[:8]}...)")

✅ Google GenAI API key loaded (starts with: AIzaSyBk...)


In [10]:
# Configure LLM
# from llama_index.llms.google_genai.base import GoogleGenAI
Settings.llm = GoogleGenAI(
    model="gemini-2.5-flash",  # Fast, cost-effective for most use cases
    temperature=0.1,      # Low temperature for consistent responses
)

# Configure Embedding Model
Settings.embed_model = GoogleGenAIEmbedding(
    model="gemini-embedding-001",  # 1536 dimensions, good balance
    dimensions=1536,                 # Can be reduced for speed (e.g., 512)
)

# Configure Text Chunking
Settings.chunk_size = 1024           # Tokens per chunk (typical: 512-1024)
Settings.chunk_overlap = 200         # 20% overlap helps preserve context

# Configure Node Parser
Settings.node_parser = SentenceSplitter(
    chunk_size=Settings.chunk_size,
    chunk_overlap=Settings.chunk_overlap,
)

print("✅ Global Settings configured successfully!")
print(f"   LLM: {Settings.llm.model}")
print(f"   Embedding: {Settings.embed_model.model_name}")
print(f"   Chunk size: {Settings.chunk_size} tokens")
print(f"   Chunk overlap: {Settings.chunk_overlap} tokens")

✅ Global Settings configured successfully!
   LLM: gemini-2.5-flash
   Embedding: text-embedding-004
   Chunk size: 1024 tokens
   Chunk overlap: 200 tokens


In [11]:
# Create sample documents (in practice, load from files)
documents = [
    Document(
        text="""
        LlamaIndex is a data framework for large language models (LLMs). 
        It provides tools to ingest, structure, and access private or domain-specific data.
        LlamaIndex was created to solve the problem of connecting LLMs to external data sources.
        The framework supports various data sources including PDFs, databases, APIs, and web pages.
        """,
        metadata={"source": "intro", "category": "overview"}
    ),
    Document(
        text="""
        Vector embeddings are numerical representations of text that capture semantic meaning.
        In LlamaIndex, embeddings enable semantic search - finding relevant content based on meaning,
        not just keyword matching. The default embedding model is OpenAI's text-embedding-3-small,
        which produces 1536-dimensional vectors. Other models like all-MiniLM-L6-v2 produce 384 dimensions.
        """,
        metadata={"source": "embeddings", "category": "technical"}
    ),
    Document(
        text="""
        The VectorStoreIndex is the most common index type in LlamaIndex. It stores document embeddings
        in a vector database and performs similarity search during queries. When you query the index,
        it retrieves the most semantically similar chunks and passes them to the LLM as context.
        This is the foundation of Retrieval-Augmented Generation (RAG).
        """,
        metadata={"source": "vector_index", "category": "technical"}
    ),
]

print(f"✅ Created {len(documents)} sample documents")
print(f"   Total characters: {sum(len(doc.text) for doc in documents)}")

✅ Created 3 sample documents
   Total characters: 1169


In [6]:

#  Create index from documents
print("Creating VectorStoreIndex...")
print("This will:")
print("  1. Chunk documents into nodes")
print("  2. Generate embeddings for each node")
print("  3. Store in in-memory vector store\n")

index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,  #displays the progress bar
)

print("\n✅ Index created successfully!")

Creating VectorStoreIndex...
This will:
  1. Chunk documents into nodes
  2. Generate embeddings for each node
  3. Store in in-memory vector store



Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]


✅ Index created successfully!


In [12]:
# Create query engine from index
query_engine = index.as_query_engine(
    similarity_top_k=2,  # Retrieve top 2 most similar chunks
    response_mode="compact",  # Compact response synthesis
)

print("✅ Query engine created!")
print(f"   Top-K: {2}")
print(f"   Response mode: compact")

✅ Query engine created!
   Top-K: 2
   Response mode: compact


In [13]:
# Query the index
query = "What is LlamaIndex used for?"
print(f"Query: {query}\n")

response = query_engine.query(query)

print("Response:")
print("-" * 80)
print(response)
print("-" * 80)

Query: What is LlamaIndex used for?

Response:
--------------------------------------------------------------------------------
LlamaIndex is a data framework designed for large language models (LLMs). It offers tools to ingest, structure, and access private or domain-specific data. Its primary purpose is to connect LLMs to external data sources.
--------------------------------------------------------------------------------


In [14]:
query1 = "How do embeddings work in LlamaIndex?"
response1 = query_engine.query(query1)

print(f"Query: {query1}\n")
print("Response:")
print(response1)
print("\nTop retrieved source:")
print(f"  Category: {response1.source_nodes[0].metadata.get('category')}")
print(f"  Score: {response1.source_nodes[0].score:.4f}")

Query: How do embeddings work in LlamaIndex?

Response:
In LlamaIndex, embeddings are numerical representations of text that capture semantic meaning. They facilitate semantic search, allowing for the discovery of relevant content based on its meaning rather than just keyword matches.

The VectorStoreIndex stores these document embeddings in a vector database. When a query is made, the index performs a similarity search using these embeddings to identify and retrieve the most semantically similar chunks of information. These retrieved chunks are then passed to a Large Language Model (LLM) as context, which is the underlying mechanism for Retrieval-Augmented Generation (RAG). The default model for generating these embeddings is OpenAI's text-embedding-3-small, which creates 1536-dimensional vectors, though other models like all-MiniLM-L6-v2, producing 384-dimensional vectors, can also be used.

Top retrieved source:
  Category: technical
  Score: 0.7761


---

## 9. Understanding the Document → Node → Index Flow

### Inspecting Nodes Directly

In [15]:
from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=200,
)
nodes = parser.get_nodes_from_documents(documents)

print(f"Number of nodes created: {len(nodes)}\n")

for i, node in enumerate(nodes, 1):
    print(f"Node {i}:")
    print(f"  ID: {node.node_id}")
    print(f"  Text length: {len(node.text)} characters")
    print(f"  Metadata: {node.metadata}")
    print(f"  Relationships: {node.relationships}")
    print()

Number of nodes created: 3

Node 1:
  ID: 360dacd5-ebe0-4895-9f43-97205409a815
  Text length: 354 characters
  Metadata: {'source': 'intro', 'category': 'overview'}
  Relationships: {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='51b7aa84-8584-4552-84f5-fb5999d7d4d0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'source': 'intro', 'category': 'overview'}, hash='6b67115e521a90d22245235f7a97b2808451a01b637bc146fc4c6b1b0126d392')}

Node 2:
  ID: b93dfcde-8fc6-458b-9d6a-3f9d037ebf02
  Text length: 395 characters
  Metadata: {'source': 'embeddings', 'category': 'technical'}
  Relationships: {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='30c5fa28-61f3-4757-91d1-ac58b0e89aa5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'source': 'embeddings', 'category': 'technical'}, hash='c74e00722500b0b510e568d1fd63750c229430501577b8d0a9295eaa48b99fb3')}

Node 3:
  ID: 6c0366b1-bad2-4ccd-8cb3-ced9a5a9c868
  Text length: 366 characters
  Metadata: {'source': 'vector_index', 'cat

In [16]:
test_query = "Explain vectore embeddings"

for k in [1,2,3]:
    engine = index.as_query_engine(similarity_top_k=k)
    response = engine.query(test_query)

    print(f"\nTop-K = {k}:")
    print(f"  Retrieved {len(response.source_nodes)} nodes")
    print(f"  Response length: {len(str(response))} characters")
    print(f"  First source score: {response.source_nodes[0].score:.4f}")


Top-K = 1:
  Retrieved 1 nodes
  Response length: 405 characters
  First source score: 0.7174

Top-K = 2:
  Retrieved 2 nodes
  Response length: 608 characters
  First source score: 0.7174

Top-K = 3:
  Retrieved 3 nodes
  Response length: 418 characters
  First source score: 0.7174


In [17]:
# Test different response modes
modes = ["compact", "tree_summarize", "simple_summarize"]
test_query = "What are the key features of LlamaIndex?"

for mode in modes:
    engine = index.as_query_engine(
        similarity_top_k=2,
        response_mode=mode
    )
    response = engine.query(test_query)
    
    print(f"\nMode: {mode}")
    print(f"Response: {response}")
    print("-" * 80)


Mode: compact
Response: LlamaIndex is a data framework for large language models (LLMs) that provides tools to ingest, structure, and access private or domain-specific data. It was developed to connect LLMs with external data sources, supporting various types such as PDFs, databases, APIs, and web pages. A key component is the VectorStoreIndex, which stores document embeddings in a vector database and performs similarity searches during queries. This process retrieves semantically similar data chunks and passes them to the LLM as context, forming the foundation of Retrieval-Augmented Generation (RAG).
--------------------------------------------------------------------------------

Mode: tree_summarize
Response: LlamaIndex is a data framework designed for large language models (LLMs). Its primary function is to provide tools for ingesting, structuring, and accessing private or domain-specific data, thereby connecting LLMs to external data sources. It supports a variety of data sources

### Response Mode Comparison

| Mode | How It Works | Best For |
|------|-------------|----------|
| **compact** | Concatenates chunks, refines iteratively | Balanced quality/speed |
| **tree_summarize** | Builds summary tree hierarchically | Large context, comprehensive answers |
| **simple_summarize** | Concatenates all chunks, single LLM call | Simple queries, speed |
| **refine** | Iteratively refines answer with each chunk | High quality, slower |
| **accumulate** | Generates separate answer per chunk | Multiple perspectives |

**Default**: `compact` (good balance for most use cases)