In [1]:
# Core LlamaIndex
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, Document
from llama_index.core.node_parser import SentenceSplitter

# LLM Integration
from llama_index.llms.groq import Groq

# Embedding Integration
from llama_index.embeddings.gemini import GeminiEmbedding

# Utilities
from dotenv import load_dotenv
import os
import warnings
warnings.filterwarnings('ignore')

print("✅ Imports successful!")

✅ Imports successful!



All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as gemini


In [4]:
# Load environment variables from .env file
load_dotenv()

# Verify OpenAI API key is set
if not os.getenv("GROQ_API_KEY"):
    print("Groq API key not found")
    
if not os.getenv("GOOGLE_API_KEY"):
    print("GOOGLE_API_KEY not found")
    
print("All is okay")


All is okay


In [10]:
# Configure LLM
Settings.llm = Groq(model="Qwen/Qwen3-32B",temperature=0.1)

# Configure Embedding Model
Settings.embed_model = GeminiEmbedding(
    model_name="models/gemini-embedding-001",title="this is a document",
)

# Configure Text Chunking
Settings.chunk_size = 1024           # Tokens per chunk (typical: 512-1024)
Settings.chunk_overlap = 200         # 20% overlap helps preserve context

# Configure Node Parser
Settings.node_parser = SentenceSplitter(
    chunk_size=Settings.chunk_size,
    chunk_overlap=Settings.chunk_overlap,
)

print("✅ Global Settings configured successfully!")
print(f"   LLM: {Settings.llm.model}")
print(f"   Embedding: {Settings.embed_model.model_name}")
print(f"   Chunk size: {Settings.chunk_size} tokens")
print(f"   Chunk overlap: {Settings.chunk_overlap} tokens")

✅ Global Settings configured successfully!
   LLM: Qwen/Qwen3-32B
   Embedding: models/gemini-embedding-001
   Chunk size: 1024 tokens
   Chunk overlap: 200 tokens


In [6]:
# Create sample documents (in practice, load from files)
documents = [
    Document(
        text="""
        LlamaIndex is a data framework for large language models (LLMs). 
        It provides tools to ingest, structure, and access private or domain-specific data.
        LlamaIndex was created to solve the problem of connecting LLMs to external data sources.
        The framework supports various data sources including PDFs, databases, APIs, and web pages.
        """,
        metadata={"source": "intro", "category": "overview"}
    ),
    Document(
        text="""
        Vector embeddings are numerical representations of text that capture semantic meaning.
        In LlamaIndex, embeddings enable semantic search - finding relevant content based on meaning,
        not just keyword matching. The default embedding model is OpenAI's text-embedding-3-small,
        which produces 1536-dimensional vectors. Other models like all-MiniLM-L6-v2 produce 384 dimensions.
        """,
        metadata={"source": "embeddings", "category": "technical"}
    ),
    Document(
        text="""
        The VectorStoreIndex is the most common index type in LlamaIndex. It stores document embeddings
        in a vector database and performs similarity search during queries. When you query the index,
        it retrieves the most semantically similar chunks and passes them to the LLM as context.
        This is the foundation of Retrieval-Augmented Generation (RAG).
        """,
        metadata={"source": "vector_index", "category": "technical"}
    ),
]

print(f"✅ Created {len(documents)} sample documents")
print(f"   Total characters: {sum(len(doc.text) for doc in documents)}")

✅ Created 3 sample documents
   Total characters: 1169


In [12]:
# Create index from documents
print("Creating VectorStoreIndex...")
print("This will:")
print("  1. Chunk documents into nodes")
print("  2. Generate embeddings for each node")
print("  3. Store in in-memory vector store\n")

index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,  # Display progress bar
)

print("\n✅ Index created successfully!")

Creating VectorStoreIndex...
This will:
  1. Chunk documents into nodes
  2. Generate embeddings for each node
  3. Store in in-memory vector store



Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]


✅ Index created successfully!


In [13]:
# Create query engine from index
query_engine = index.as_query_engine(
    similarity_top_k=2,  # Retrieve top 2 most similar chunks
    response_mode="compact",  # Compact response synthesis
)

print("✅ Query engine created!")
print(f"   Top-K: {2}")
print(f"   Response mode: compact")

✅ Query engine created!
   Top-K: 2
   Response mode: compact


In [None]:
import re
def groqLlmResponse(response):
    return  re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()

In [27]:
# Query the index
 
query = "What is LlamaIndex used for?"
print(f"Query: {query}\n")

response_obj = query_engine.query(query)
response = response_obj.response
response = groqLlmResponse(response)

print("Response:")
print("-" * 80)
print(response)
print("-" * 80)

Query: What is LlamaIndex used for?

Response:
--------------------------------------------------------------------------------
LlamaIndex is designed to bridge large language models (LLMs) with external data sources by offering tools to ingest, organize, and retrieve private or specialized data. It enables LLMs to access structured information from diverse sources like documents, databases, and APIs. A key application is its support for Retrieval-Augmented Generation (RAG), where it uses vector-based similarity searches to provide contextually relevant data to the model during queries, enhancing the accuracy and relevance of responses.
--------------------------------------------------------------------------------


In [23]:
# Inspect source nodes
print(f"Number of source nodes: {len(response_obj.source_nodes)}\n")

for i, node in enumerate(response_obj.source_nodes, 1):
    print(f"Source Node {i}:")
    print(f"  Score: {node.score:.4f}")  # Similarity score (0-1)
    print(f"  Metadata: {node.metadata}")
    print(f"  Text (first 200 chars): {node.text[:200]}...")
    print()

Number of source nodes: 2

Source Node 1:
  Score: 0.8994
  Metadata: {'source': 'intro', 'category': 'overview'}
  Text (first 200 chars): LlamaIndex is a data framework for large language models (LLMs). 
        It provides tools to ingest, structure, and access private or domain-specific data.
        LlamaIndex was created to solve th...

Source Node 2:
  Score: 0.8580
  Metadata: {'source': 'vector_index', 'category': 'technical'}
  Text (first 200 chars): The VectorStoreIndex is the most common index type in LlamaIndex. It stores document embeddings
        in a vector database and performs similarity search during queries. When you query the index,
  ...



In [32]:
query1 = "How do embeddings work in LlamaIndex?"
response1 = query_engine.query(query1)
clearResponse1 = response1.response

print(f"Query: {query1}\n")
print("Response:")
print(groqLlmResponse(clearResponse1))
print("\nTop retrieved source:")
print(f"  Category: {response1.source_nodes[0].metadata.get('category')}")
print(f"  Score: {response1.source_nodes[0].score:.4f}")

Query: How do embeddings work in LlamaIndex?

Response:
In LlamaIndex, embeddings convert text into numerical vectors that encode semantic meaning, enabling systems to understand and compare the contextual relationships between pieces of text. These vectors are generated using models like OpenAI's text-embedding-3-small (producing 1536-dimensional vectors) or alternatives such as all-MiniLM-L6-v2 (384-dimensional vectors). 

When documents are processed, their embeddings are stored in a vector database as part of the VectorStoreIndex. During a query, the system calculates the similarity between the query's embedding and stored document embeddings, retrieving the most semantically relevant chunks. These retrieved results are then provided as context to a language model to generate responses, forming the core mechanism of Retrieval-Augmented Generation (RAG). This approach ensures searches are based on meaning rather than exact keyword matches.

Top retrieved source:
  Category: technica

In [34]:
query2 = "What is Retrieval-Augmented Generation?"
response2 = query_engine.query(query2)
clearResponse2 = response2.response

print(f"Query: {query2}\n")
print("Response:")
print(groqLlmResponse(clearResponse2))

Query: What is Retrieval-Augmented Generation?

Response:
Retrieval-Augmented Generation (RAG) is a method that enhances the responses of large language models by integrating externally retrieved information. It works by first identifying and extracting relevant data from external sources based on the input query, then using that retrieved information as contextual input for the language model. This approach improves the accuracy and relevance of generated outputs by grounding them in domain-specific or up-to-date data, rather than relying solely on the model's internal knowledge.


In [35]:
# Parse documents into nodes manually to understand the flow
from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = parser.get_nodes_from_documents(documents)

print(f"Number of nodes created: {len(nodes)}\n")

for i, node in enumerate(nodes, 1):
    print(f"Node {i}:")
    print(f"  ID: {node.node_id}")
    print(f"  Text length: {len(node.text)} characters")
    print(f"  Metadata: {node.metadata}")
    print(f"  Relationships: {node.relationships}")
    print()

Number of nodes created: 3

Node 1:
  ID: a9119f4e-9fc8-4f4d-afc6-b00ee697b65f
  Text length: 354 characters
  Metadata: {'source': 'intro', 'category': 'overview'}
  Relationships: {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='5bbec56a-f038-4b3d-bbee-8b6484c232bd', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'source': 'intro', 'category': 'overview'}, hash='6b67115e521a90d22245235f7a97b2808451a01b637bc146fc4c6b1b0126d392')}

Node 2:
  ID: c8246393-f2f5-4c0b-9809-2503b710b920
  Text length: 395 characters
  Metadata: {'source': 'embeddings', 'category': 'technical'}
  Relationships: {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a5a8dca4-b54b-49c9-a902-d145d4746991', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'source': 'embeddings', 'category': 'technical'}, hash='c74e00722500b0b510e568d1fd63750c229430501577b8d0a9295eaa48b99fb3')}

Node 3:
  ID: a21b670b-5551-4e05-b15b-b7e085085e61
  Text length: 366 characters
  Metadata: {'source': 'vector_index', 'cat

In [36]:
# Test with different top_k values
test_query = "Explain vector embeddings"

for k in [1, 2, 3]:
    engine = index.as_query_engine(similarity_top_k=k)
    response = engine.query(test_query)
    
    print(f"\nTop-K = {k}:")
    print(f"  Retrieved {len(response.source_nodes)} nodes")
    print(f"  Response length: {len(str(response))} characters")
    print(f"  First source score: {response.source_nodes[0].score:.4f}")


Top-K = 1:
  Retrieved 1 nodes
  Response length: 1788 characters
  First source score: 0.8665

Top-K = 2:
  Retrieved 2 nodes
  Response length: 2994 characters
  First source score: 0.8665

Top-K = 3:
  Retrieved 3 nodes
  Response length: 2959 characters
  First source score: 0.8665


In [38]:
# Test different response modes
modes = ["compact", "tree_summarize", "simple_summarize"]
test_query = "What are the key features of LlamaIndex?"

for mode in modes:
    engine = index.as_query_engine(
        similarity_top_k=2,
        response_mode=mode
    )
    response = engine.query(test_query)
    clearResponse = response.response
    
    print(f"\nMode: {mode}")
    print(f"Response: {groqLlmResponse(clearResponse)}")
    print("-" * 80)


Mode: compact
Response: LlamaIndex is designed to bridge large language models (LLMs) with external data sources, offering tools to ingest, structure, and access private or domain-specific data. Key features include support for diverse data formats such as PDFs, databases, APIs, and web pages. It employs a **VectorStoreIndex**, which stores document embeddings in a vector database to enable similarity searches during queries. This process retrieves semantically relevant data chunks, enhancing LLM responses through Retrieval-Augmented Generation (RAG). The framework emphasizes efficient integration of external context into language model workflows.
--------------------------------------------------------------------------------

Mode: tree_summarize
Response: LlamaIndex is designed to enhance the capabilities of large language models (LLMs) by enabling seamless integration with external data sources. Key features include support for diverse data formats such as documents, databases, AP