In [12]:
import os
from dotenv import load_dotenv
load_dotenv()
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_index.llms.google_genai import GoogleGenAI

from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import json
from pathlib import Path


In [3]:
# !pip install llama-index-retrievers-bm25
#pip install PyStemmer

In [60]:
process_directory = 'processed_data/llama_index'

In [89]:
config_path = (Path(process_directory) /'config.json').as_posix()
with open(config_path, 'r') as fp:
    config = json.load(fp)

In [90]:
choma_path = config['chromadb_path']
chroma_colection_name = config['chroma_collection']

In [91]:
llm = GoogleGenAI(
    model="models/gemini-2.0-flash",
    api_key=os.environ['GEMINI_API_KEY'], 
)
embed_model =  GoogleGenAIEmbedding(model_name="models/text-embedding-004", api_key=os.environ['GEMINI_API_KEY'])

In [92]:
Settings.llm = llm
Settings.embed_model = embed_model

In [93]:
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler, simple_llm_handler
import tiktoken
from utilities import GeminiTokenizer

In [94]:
Settings.tokenizer = GeminiTokenizer().encode

In [95]:
token_counter = TokenCountingHandler(
    tokenizer=GeminiTokenizer().encode,
    verbose=True,  # set to true to see usage printed to the console
)

llm_debugger = simple_llm_handler.SimpleLLMHandler()
handlers = [token_counter] #, llm_debugger]
Settings.callback_manager = CallbackManager(handlers)

In [96]:
chroma_command = f"chroma run --path {choma_path}"
print(chroma_command)

chroma run --path processed_data/llama_index/chromadb


In [98]:
remote_db = chromadb.HttpClient()
chroma_collection = remote_db.get_or_create_collection('contextual')
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [99]:
chroma_collection.count()

2084

In [100]:
# QUERY = 'How to Implement Callback manager to count tokens of all llm calls made, through any component, maybe extractor or retreiver, or query engine.'
QUERY = 'how to use gemini as the llm and for embeddings'

In [101]:
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

In [102]:
index_retreiver = index.as_retriever(similarity_top_k=45)

In [103]:
context_nodes = index_retreiver.retrieve(QUERY)

Embedding Token Usage: 12


In [104]:
context_nodes[:2]

[NodeWithScore(node=TextNode(id_='api_reference/embeddings/gemini.md-0-87', embedding=None, metadata={'file_path': 'api_reference/embeddings/gemini.md', 'file_name': 'gemini.md', 'file_size': 88, 'creation_date': '2025-02-11', 'last_modified_date': '2025-02-11', 'header_path': '/', 'context': 'This chunk is the entire document.\n'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='api_reference/embeddings/gemini.md', node_type='4', metadata={'file_path': 'api_reference/embeddings/gemini.md', 'file_name': 'gemini.md', 'file_size': 88, 'creation_date': '2025-02-11', 'last_modified_date': '2025-02-11'}, hash='de1d135c1cd76d13df58c394442ce97650b7d28aa3a20c3c8174ba1d7532f723')}, metadata_template='{key}: {value}'

In [105]:

# nodes_info  = chroma_collection.get(include=[])['ids']
# index = VectorStoreIndex.from_vector_store(vector_store)


In [106]:
nodes_info = chroma_collection.get()

In [107]:
nodes_info.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'data', 'uris', 'included'])

In [108]:
nodes_info['metadatas'][-1]

{'_node_content': '{"id_": "module_guides/storing/vector_stores.md-7039-10664", "embedding": null, "metadata": {"file_path": "module_guides/storing/vector_stores.md", "file_name": "vector_stores.md", "file_size": 10974, "creation_date": "2025-02-11", "last_modified_date": "2025-02-11", "header_path": "/Vector Stores/", "context": "This section lists example Jupyter notebooks demonstrating how to use various vector stores with LlamaIndex.\\n"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "module_guides/storing/vector_stores.md", "node_type": "4", "metadata": {"file_path": "module_guides/storing/vector_stores.md", "file_name": "vector_stores.md", "file_size": 10974, "creation_date": "2025-02-11", "last_modified_date": "2025-02-11"}, "h

In [109]:
nodes = vector_store.get_nodes(nodes_info['ids'])

In [110]:
len(nodes)

2084

In [111]:
# nodes[1].dict()

In [112]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer

In [113]:


# We can pass in the index, docstore, or list of nodes to create the retriever
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=20,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

In [114]:
bm25_nodes = bm25_retriever.retrieve('what are callback handlers?')

In [115]:
for node in bm25_nodes:
    print(node.node.node_id, '-----', node.score)

api_reference/callbacks/langfuse.md-0-98 ----- 5.477563858032227
understanding/tracing_and_debugging/tracing_and_debugging.md-525-1300 ----- 5.186171054840088
api_reference/callbacks/uptrain.md-0-94 ----- 4.90223503112793
api_reference/callbacks/promptlayer.md-0-94 ----- 4.90223503112793
api_reference/callbacks/deepeval.md-0-98 ----- 4.90223503112793
api_reference/callbacks/arize_phoenix.md-0-108 ----- 4.871725082397461
api_reference/callbacks/argilla.md-0-96 ----- 4.871725082397461
module_guides/observability/index.md-18792-18893 ----- 4.868949890136719
community/integrations/uptrain.md-8404-8713 ----- 4.773504257202148
community/integrations/uptrain.md-2707-3132 ----- 4.572553634643555
module_guides/observability/index.md-16664-16947 ----- 4.515216827392578
module_guides/observability/index.md-15309-15401 ----- 3.982264518737793
community/integrations/uptrain.md-5279-5617 ----- 3.734769105911255
community/integrations/uptrain.md-5619-6485 ----- 3.7272391319274902
community/integratio

In [116]:
from llama_index.core.retrievers import QueryFusionRetriever


In [117]:
import nest_asyncio

nest_asyncio.apply()

In [118]:
retriever = QueryFusionRetriever(
    [
        index_retreiver,
        bm25_retriever,
    ],
    num_queries=4,
    similarity_top_k = 50,
    llm=llm,
    retriever_weights=[0.7, 0.3],
    use_async=True,
)

In [119]:
ret_queries = retriever._get_queries(QUERY)
ret_queries

LLM Prompt Token Usage: 54
LLM Completion Token Usage: 22


[QueryBundle(query_str='how to use gemini llm', image_path=None, custom_embedding_strs=None, embedding=None),
 QueryBundle(query_str='gemini llm embeddings tutorial', image_path=None, custom_embedding_strs=None, embedding=None),
 QueryBundle(query_str='gemini api for llm and embeddings', image_path=None, custom_embedding_strs=None, embedding=None)]

In [120]:
for ret_query in ret_queries:
    print(ret_query)
    emb = embed_model.get_query_embedding(ret_query.query_str)
    print('--------')
print(QUERY)
emb = embed_model.get_query_embedding(QUERY)

how to use gemini llm
Embedding Token Usage: 7
--------
gemini llm embeddings tutorial
Embedding Token Usage: 5
--------
gemini api for llm and embeddings
Embedding Token Usage: 7
--------
how to use gemini as the llm and for embeddings
Embedding Token Usage: 12


In [121]:
hybrid_nodes = retriever.retrieve(QUERY)

LLM Prompt Token Usage: 54
LLM Completion Token Usage: 22
Embedding Token Usage: 5
Embedding Token Usage: 7
Embedding Token Usage: 12
Embedding Token Usage: 7


In [122]:
# len(Settings.tokenizer("You are a helpful assistant that generates multiple search queries based on a single input query. Generate 3 search queries, one on each line, related to the following input query:\n\
# Query: what are callback handlers?\n\
# Queries:\n\
# "))

In [123]:
# hybrid_nodes[0].dict()

In [124]:
for node in hybrid_nodes:
    print(node.node.node_id, '-----', node.score)

api_reference/embeddings/gemini.md-0-87 ----- 5.783661365509033
api_reference/llms/gemini.md-0-72 ----- 5.5878520011901855
understanding/agent/index.md-3073-3437 ----- 5.290548324584961
module_guides/models/multi_modal.md-5156-8283 ----- 4.709981441497803
module_guides/evaluating/evaluating_evaluators_with_llamadatasets.md-0-1865 ----- 4.363983631134033
api_reference/multi_modal_llms/gemini.md-0-94 ----- 4.359795093536377
module_guides/models/multi_modal.md-9705-10409 ----- 4.213951110839844
getting_started/starter_example_local.md-0-448 ----- 3.927900791168213
community/faq/embeddings.md-1121-1349 ----- 3.899174690246582
understanding/indexing/indexing.md-1757-2303 ----- 3.8961241245269775
community/faq/llms.md-25-633 ----- 3.7066307067871094
community/faq/embeddings.md-14-494 ----- 3.7046380043029785
understanding/evaluating/cost_analysis/index.md-3404-3688 ----- 3.6233983039855957
understanding/indexing/indexing.md-757-1755 ----- 3.4983839988708496
module_guides/loading/documents_an

In [125]:
for node in context_nodes:
    print(node.node.node_id, '-----', node.score)

api_reference/embeddings/gemini.md-0-87 ----- 0.7104416075854444
api_reference/llms/gemini.md-0-72 ----- 0.6754126524282204
community/faq/embeddings.md-0-12 ----- 0.6651634935775171
community/frequently_asked_questions.md-214-342 ----- 0.6553498762103321
api_reference/embeddings/google.md-0-158 ----- 0.6419364265094576
module_guides/models/embeddings.md-0-12 ----- 0.6232502230085977
api_reference/embeddings/vllm.md-0-83 ----- 0.6217949006512549
community/faq/embeddings.md-14-494 ----- 0.6190136241012562
community/faq/embeddings.md-901-1119 ----- 0.6188554892257974
community/frequently_asked_questions.md-41-212 ----- 0.6184940686339753
understanding/agent/index.md-3073-3437 ----- 0.617153228292253
api_reference/multi_modal_llms/gemini.md-0-94 ----- 0.6167912577714008
api_reference/embeddings/openai.md-0-87 ----- 0.6158111103662927
api_reference/embeddings/langchain.md-0-93 ----- 0.6150433311487288
optimizing/custom_modules.md-907-1017 ----- 0.6124784824019214
use_cases/fine_tuning.md-0-

In [126]:
for node in bm25_nodes:
    print(node.node.node_id, '-----', node.score)

api_reference/callbacks/langfuse.md-0-98 ----- 5.477563858032227
understanding/tracing_and_debugging/tracing_and_debugging.md-525-1300 ----- 5.186171054840088
api_reference/callbacks/uptrain.md-0-94 ----- 4.90223503112793
api_reference/callbacks/promptlayer.md-0-94 ----- 4.90223503112793
api_reference/callbacks/deepeval.md-0-98 ----- 4.90223503112793
api_reference/callbacks/arize_phoenix.md-0-108 ----- 4.871725082397461
api_reference/callbacks/argilla.md-0-96 ----- 4.871725082397461
module_guides/observability/index.md-18792-18893 ----- 4.868949890136719
community/integrations/uptrain.md-8404-8713 ----- 4.773504257202148
community/integrations/uptrain.md-2707-3132 ----- 4.572553634643555
module_guides/observability/index.md-16664-16947 ----- 4.515216827392578
module_guides/observability/index.md-15309-15401 ----- 3.982264518737793
community/integrations/uptrain.md-5279-5617 ----- 3.734769105911255
community/integrations/uptrain.md-5619-6485 ----- 3.7272391319274902
community/integratio

In [127]:
vector_node_ids = [node.node.node_id for node in context_nodes]
bm25_node_ids = [node.node.node_id for node in bm25_nodes]
 
for node in hybrid_nodes:
    if node.node.node_id in vector_node_ids:
        print(node.node.node_id, '-----', node.score, 'vector')
    if node.node.node_id in bm25_node_ids:
        print(node.node.node_id, '-----', node.score, 'bm25')
    print(node.node.get_content(metadata_mode='embed')[:100])
    print('\n\n-------------------------------------\n\n')

api_reference/embeddings/gemini.md-0-87 ----- 5.783661365509033 vector
[Excerpt from document]
file_path: api_reference/embeddings/gemini.md
header_path: /
context: This c


-------------------------------------


api_reference/llms/gemini.md-0-72 ----- 5.5878520011901855 vector
[Excerpt from document]
file_path: api_reference/llms/gemini.md
header_path: /
context: This chunk d


-------------------------------------


understanding/agent/index.md-3073-3437 ----- 5.290548324584961 vector
[Excerpt from document]
file_path: understanding/agent/index.md
header_path: /Building a basic agent


-------------------------------------


module_guides/models/multi_modal.md-5156-8283 ----- 4.709981441497803 vector
[Excerpt from document]
file_path: module_guides/models/multi_modal.md
header_path: /[Beta] Multi-mo


-------------------------------------


[Excerpt from document]
file_path: module_guides/evaluating/evaluating_evaluators_with_llamadatasets


-------------------------------------


a

In [128]:
text = '\n------------------\n'.join([node.node.get_content(metadata_mode='embed') for node in hybrid_nodes])
print(len(Settings.tokenizer(text)))

10136


In [129]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine(retriever)

In [130]:
# query_engine.synthesize

In [131]:
# bm25_retriever.persist('./bm25retreiver')

In [132]:
response = query_engine.query(QUERY)
print(response)

LLM Prompt Token Usage: 54
LLM Completion Token Usage: 24
Embedding Token Usage: 12
Embedding Token Usage: 9
Embedding Token Usage: 5
Embedding Token Usage: 7
LLM Prompt Token Usage: 10122
LLM Completion Token Usage: 119
To use Gemini, you can access it via an API. You can also use Gemini for multi-modal applications.

The `llama_index.llms.gemini` module defines the Gemini LLM.

The `llama_index.embeddings.gemini` module includes `GeminiEmbedding`.

There are example notebooks for using Gemini with multi-modal applications. These notebooks demonstrate how to integrate Multi-Modal LLM models, Multi-Modal embeddings, Multi-Modal vector stores, Retrievers, and Query engines for composing Multi-Modal Retrieval Augmented Generation (RAG) orchestration.


In [113]:
print(
    "Embedding Tokens: ",
    token_counter.total_embedding_token_count,
    "\n",
    "LLM Prompt Tokens: ",
    token_counter.prompt_llm_token_count,
    "\n",
    "LLM Completion Tokens: ",
    token_counter.completion_llm_token_count,
    "\n",
    "Total LLM Token Count: ",
    token_counter.total_llm_token_count,
)

Embedding Tokens:  379 
 LLM Prompt Tokens:  29586 
 LLM Completion Tokens:  523 
 Total LLM Token Count:  30109


In [114]:
token_counter.total_llm_token_count

30109

## Rerank for better results 

In [115]:
from llama_index.postprocessor.cohere_rerank import CohereRerank

cohere_rerank = CohereRerank(
    top_n=20, model="rerank-v3.5", api_key=os.environ['COHERE_API_KEY']
)

In [121]:
reranked_nodes = cohere_rerank.postprocess_nodes(nodes=hybrid_nodes, query_str=QUERY)

In [122]:
reranked_nodes

[NodeWithScore(node=TextNode(id_='understanding/evaluating/cost_analysis/usage_pattern.md-17-1995', embedding=None, metadata={'file_path': 'understanding/evaluating/cost_analysis/usage_pattern.md', 'file_name': 'usage_pattern.md', 'file_size': 1996, 'creation_date': '2025-02-11', 'last_modified_date': '2025-02-11', 'header_path': '/Usage Pattern/', 'context': 'This section details a step-by-step guide on how to use the `TokenCountingHandler` callback to estimate LLM and embedding token counts during index construction and querying.\n'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='understanding/evaluating/cost_analysis/usage_pattern.md', node_type='4', metadata={'file_path': 'understanding/evaluating/cos

In [124]:
print(reranked_nodes[-2].node.text)

# Customizing LLMs within LlamaIndex Abstractions

You can plugin these LLM abstractions within our other modules in LlamaIndex (indexes, retrievers, query engines, agents) which allow you to build advanced workflows over your data.

By default, we use OpenAI's `gpt-3.5-turbo` model. But you may choose to customize
the underlying LLM being used.

Below we show a few examples of LLM customization. This includes

- changing the underlying LLM
- changing the number of output tokens (for OpenAI, Cohere, or AI21)
- having more fine-grained control over all parameters for any LLM, from context window to chunk overlap


In [116]:
query_engine_rerank = RetrieverQueryEngine(retriever, node_postprocessors=[cohere_rerank])

In [118]:
response_rerank = query_engine_rerank.query(QUERY)

LLM Prompt Token Usage: 72
LLM Completion Token Usage: 29
Embedding Token Usage: 7
Embedding Token Usage: 30
Embedding Token Usage: 9
Embedding Token Usage: 10
LLM Prompt Token Usage: 7245
LLM Completion Token Usage: 433


In [120]:
print(response_rerank)

To count tokens from LLM calls across different components like extractors, retrievers, or query engines, you can use the `TokenCountingHandler` callback along with the `CallbackManager`. Here's a step-by-step guide:

1.  Set up the `TokenCountingCallback` handler:

    ```python
    import tiktoken
    from llama_index.core.callbacks import CallbackManager, TokenCountingHandler

    token_counter = TokenCountingHandler(
        tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
    )

    callback_manager = CallbackManager([token_counter])
    ```

2.  Add the callback manager to the global `Settings`:

    ```python
    from llama_index.core import Settings

    Settings.callback_manager = callback_manager
    ```

After setting up the `TokenCountingHandler` and adding it to the global `Settings`, LlamaIndex will automatically track token usage for LLM calls made by any component. You can then access the counts directly:

```python
print(
    "Embedding Tokens: ",
    toke

In [127]:
resp = query_engine_rerank.query('how to do chat with query fusion retreiver and cohere rerank? Provide code. I want to chat, not just query')

LLM Prompt Token Usage: 69
LLM Completion Token Usage: 41
Embedding Token Usage: 27
Embedding Token Usage: 14
Embedding Token Usage: 12
Embedding Token Usage: 12
LLM Prompt Token Usage: 5596
LLM Completion Token Usage: 701


In [128]:
print(resp)

While the provided documents do not contain a specific example that combines a chat engine with query fusion and Cohere rerank, they do offer guidance on how to achieve this.

To create a chat engine with query fusion and Cohere rerank, you would need to:

1.  **Set up a Query Engine:** Construct a query engine that utilizes a query fusion retriever and the CohereRerank node postprocessor. The query engine serves as the foundation for answering questions over your data.
2.  **Configure a Chat Engine:**  Then configure a chat engine, and there are multiple chat modes available. You can configure the chat engine in a high-level or low-level approach.
3.  **Choose a Chat Mode**: You can select a chat mode like `context` or `condense_plus_context` to incorporate retrieved context into the chat.

Here's a conceptual outline based on the provided information:

```python
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import BaseRetriev

In [130]:
print(resp.get_formatted_sources())

> Source (Doc id: module_guides/deploying/query_engine/index.md-16-593): ## Concept

Query engine is a generic interface that allows you to ask question over your data.

...

> Source (Doc id: api_reference/retrievers/query_fusion.md-0-90): ::: llama_index.core.retrievers
    options:
      members:
        - QueryFusionRetriever

> Source (Doc id: api_reference/postprocessor/cohere_rerank.md-0-94): ::: llama_index.postprocessor.cohere_rerank
    options:
      members:
        - CohereRerank

> Source (Doc id: module_guides/querying/node_postprocessors/node_postprocessors.md-2547-2974): ## CohereRerank

Uses the "Cohere ReRank" functionality to re-order nodes, and returns the top N ...

> Source (Doc id: module_guides/querying/node_postprocessors/index.md-624-1542): ## Usage Pattern

An example of using a node postprocessors is below:

```python
from llama_index...

> Source (Doc id: module_guides/querying/retriever/retrievers.md-655-1482): ### Advanced Retrieval and Search

These gui

## Chat Engine

In [132]:
from llama_index.core.chat_engine import ContextChatEngine, CondensePlusContextChatEngine
from llama_index.core.memory import ChatMemoryBuffer



In [133]:
memory = ChatMemoryBuffer.from_defaults(token_limit=8000)

In [135]:
chat_engine = CondensePlusContextChatEngine(retriever=retriever, llm=llm, memory=memory, node_postprocessors=[cohere_rerank], verbose=True)

In [136]:
res = chat_engine.chat('how to use a react chat engine, given that I have a retriever? provide code', )

Condensed question: how to use a react chat engine, given that I have a retriever? provide code
LLM Prompt Token Usage: 59
LLM Completion Token Usage: 22
Embedding Token Usage: 7
Embedding Token Usage: 17
Embedding Token Usage: 6
Embedding Token Usage: 6
LLM Prompt Token Usage: 4022
LLM Completion Token Usage: 1447


In [138]:
print(res)

Okay, I can help with that! Here's how you can use a React Chat Engine with a retriever in LlamaIndex.

Firstly, it's important to understand the relationship between Chat Engines, Query Engines, and Retrievers. A Retriever is responsible for fetching the most relevant context given a user query. It can be built on top of indexes, but can also be defined independently. It's a key building block in both Query Engines and Chat Engines. Chat Engines are stateful interfaces for having conversations with your data, keeping track of conversation history to answer questions with past context in mind.

While the documents don't provide a direct, copy-and-paste code example for integrating a React frontend *directly* with a retriever and chat engine, they do provide the key components and concepts. Here's a breakdown of how you can approach this, combining the information from the documents:

1.  **Backend (LlamaIndex)**:
    *   You'll need to set up a LlamaIndex chat engine in your backend. T

In [139]:
res2 = chat_engine.chat('I meant how to use REACT agent based chat engine in llama index. Please provide code to react a react Agent based chat engine. I want to create it from a retriever and not from an index directly.')

LLM Prompt Token Usage: 1560
LLM Completion Token Usage: 30
Condensed question: How can I create a React Agent-based chat engine in LlamaIndex from a retriever, providing code examples for the agent's React functionality?

LLM Prompt Token Usage: 71
LLM Completion Token Usage: 26
Embedding Token Usage: 30
Embedding Token Usage: 9
Embedding Token Usage: 6
Embedding Token Usage: 8
LLM Prompt Token Usage: 8195
LLM Completion Token Usage: 1146


In [140]:
print(res2)

Okay, I understand now. You want to use a `ReActAgent` as your chat engine, building it from a retriever instead of directly from an index. Here's how you can do that, combining the information from the documents:

```python
from llama_index.core.agent import ReActAgent
from llama_index.llms.openai import OpenAI
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core.prompts import PromptTemplate

# 1. Define your Retriever
# Assume you have a retriever instance (e.g., from a VectorStoreIndex)
# my_retriever = ...

# 2. Create a Query Engine from the Retriever
#    This wraps the retriever in a query engine interface.
query_engine = RetrieverQueryEngine.from_args(retriever=my_retriever)

# 3. (Optional) Add a name and description to the query engine tool
#    This helps the agent decide when to use it.
query_engine_tool = QueryEngineT

In [141]:
res3 = chat_engine.chat('what is the capital of assam?')

LLM Prompt Token Usage: 2717
LLM Completion Token Usage: 8
Condensed question: What is the capital of Assam?

LLM Prompt Token Usage: 49
LLM Completion Token Usage: 25
Embedding Token Usage: 8
Embedding Token Usage: 8
Embedding Token Usage: 4
Embedding Token Usage: 10
LLM Prompt Token Usage: 16255
LLM Completion Token Usage: 28


In [142]:
print(res3)

I'm sorry, but I cannot answer that question. The provided documents do not contain information about the capital of Assam.



In [145]:
chat_engine.reset()

In [146]:
res4 = chat_engine.chat('what did I ask you before?')

Condensed question: what did I ask you before?
LLM Prompt Token Usage: 49
LLM Completion Token Usage: 29
Embedding Token Usage: 7
Embedding Token Usage: 8
Embedding Token Usage: 8
Embedding Token Usage: 10
LLM Prompt Token Usage: 9284
LLM Completion Token Usage: 197


In [147]:
print(res4)

I do not have access to past conversations, so I don't know what you asked me before. However, the documentation does mention tools for storing chat history.

Specifically, the `PostgresChatStore` allows you to store chat history remotely using PostgreSQL. Here's how you can initialize and use it with `ChatMemoryBuffer`:

```python
from llama_index.storage.chat_store.postgres import PostgresChatStore
from llama_index.core.memory import ChatMemoryBuffer

chat_store = PostgresChatStore.from_uri(
    uri="postgresql+asyncpg://postgres:password@127.0.0.1:5432/database",
)

chat_memory = ChatMemoryBuffer.from_defaults(
    token_limit=3000,
    chat_store=chat_store,
    chat_store_key="user1",
)
```


## ReACT Agent Chat Engine

In [152]:
from llama_index.core.agent import ReActAgent
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.tools import QueryEngineTool
from llama_index.core.tools.types import ToolMetadata

In [153]:
query_engine = RetrieverQueryEngine.from_args(retriever=retriever, node_postprocessors=[cohere_rerank])

# 3. (Optional) Add a name and description to the query engine tool
#    This helps the agent decide when to use it.
query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata= ToolMetadata(
            name="my_retriever_tool",
            description="Useful for answering questions about the documents I have access to.",
            return_direct=False
        ),
)

In [154]:
agent = ReActAgent.from_tools(
    tools=[query_engine_tool],
    llm=llm,
    verbose=True,
    # react_prompt=react_prompt,
)

In [155]:
resp1 = agent.chat(QUERY)

> Running step 85c3d896-43dd-4b61-a42e-2c12a8236bb4. Step input: How to Implement Callback manager to count tokens of all llm calls made, through any component, maybe extractor or retreiver, or query engine.
LLM Prompt Token Usage: 523
LLM Completion Token Usage: 70
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to find information about implementing a callback manager to count tokens of all LLM calls.
Action: my_retriever_tool
Action Input: {'input': 'implement callback manager to count tokens of all llm calls'}
[0mLLM Prompt Token Usage: 53
LLM Completion Token Usage: 28
Embedding Token Usage: 9
Embedding Token Usage: 11
Embedding Token Usage: 8
Embedding Token Usage: 8
LLM Prompt Token Usage: 6450
LLM Completion Token Usage: 210
[1;3;34mObservation: To count tokens from LLM calls, you can implement a callback manager using the `TokenCountingHandler`. Here's how:

1.  Instantiate `TokenCountingHandler`, optionally setting a tokenizer.
2. 

In [160]:
# totla llm counts till now
print('prompt tokens', token_counter.prompt_llm_token_count, 'op tokens', token_counter.completion_llm_token_count)

prompt tokens 99135 op tokens 5241


## Experiment

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from custom_components.custom_google_genai import GoogleGenAI

In [3]:
llm = GoogleGenAI(
            model="models/gemini-2.0-flash",
            api_key=os.environ['GEMINI_API_KEY'], 
            max_retries=2,  # Number of retry attempts
            retry_on_rate_limit=True,
            additional_kwargs={"stream_options": {"include_usage": True}}
        )

In [4]:
llm

GoogleGenAI(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x0000017089826150>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x00000170FE71A0C0>, completion_to_prompt=<function default_completion_to_prompt at 0x00000170FFE1C7C0>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='models/gemini-2.0-flash', temperature=0.1, context_window=None, is_function_calling_model=True)

In [5]:
from llama_index.core.base.llms.types import ChatMessage

In [6]:
# response = llm.chat([ChatMessage(role="user", content="Hello")])

In [11]:
from langfuse.llama_index import LlamaIndexInstrumentor
 
instrumentor = LlamaIndexInstrumentor()

In [12]:
instrumentor.start()
 
# ... your LlamaIndex index creation ...
 
# response = index.as_query_engine().query("What is the use of callbacks?")
response = llm.chat([ChatMessage(role="user", content="Hello")])
 
# Flush events to langfuse
 
instrumentor.flush()

Trace ID is not set. Creating generation client with new trace id.


In [8]:
response = llm.chat([ChatMessage(role="user", content="Hello")])

In [10]:
response.raw

{'content': {'parts': [{'video_metadata': None,
    'thought': None,
    'code_execution_result': None,
    'executable_code': None,
    'file_data': None,
    'function_call': None,
    'function_response': None,
    'inline_data': None,
    'text': 'Hello there! How can I help you today?\n'}],
  'role': 'model'},
 'citation_metadata': None,
 'finish_message': None,
 'token_count': None,
 'finish_reason': <FinishReason.STOP: 'STOP'>,
 'avg_logprobs': -0.06534120711413297,
 'grounding_metadata': None,
 'index': None,
 'logprobs_result': None,
 'safety_ratings': None,
 'usage_metadata': {'cache_tokens_details': None,
  'cached_content_token_count': None,
  'candidates_token_count': 11,
  'candidates_tokens_details': [{'modality': <MediaModality.TEXT: 'TEXT'>,
    'token_count': 11}],
  'prompt_token_count': 1,
  'prompt_tokens_details': [{'modality': <MediaModality.TEXT: 'TEXT'>,
    'token_count': 1}],
  'thoughts_token_count': None,
  'tool_use_prompt_token_count': None,
  'tool_use_p