## How a TruLens evaluation function works on the LlamaIndex platform

In [46]:
# Environment setup: imports first, then the two side-effecting calls.
import os

import nest_asyncio
from dotenv import load_dotenv

# Load environment variables (e.g. API keys) via python-dotenv.
load_dotenv()
# Apply nest_asyncio so asyncio-based code can run inside the notebook kernel.
nest_asyncio.apply()

In [47]:
# Create the TruLens session used to track evaluations in this notebook.
from trulens.core import TruSession

session = TruSession()
# NOTE(review): reset_database() runs every time this cell is executed and
# presumably clears previously logged apps/records — confirm before pointing
# the session at a shared database.
session.reset_database()
# session.get_leaderboard()

 ... (more hidden) ...
 ... (more hidden) ...
 ... (more hidden) ...


In [48]:
# Download the Paul Graham essay used as the demo corpus, unless a copy is
# already on disk.
import os
import urllib.request

url = "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt"
file_path = "data_llama/paul_graham_essay.txt"
if not os.path.exists(file_path):
    # urlretrieve does not create missing parent directories, so make sure
    # "data_llama/" exists first; otherwise a fresh checkout fails with
    # FileNotFoundError.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    urllib.request.urlretrieve(url, file_path)

In [None]:
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.openai import OpenAI

# Global LlamaIndex configuration: chunk size/overlap and the LLM
# (OpenAI with its default arguments).
Settings.chunk_size = 128
Settings.chunk_overlap = 16
Settings.llm = OpenAI()

# Read every file under data_llama/ and build a vector index over it.
documents = SimpleDirectoryReader("data_llama").load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine configured with similarity_top_k=3.
query_engine = index.as_query_engine(similarity_top_k=3)

# Smoke test: run one query to confirm the engine produces an answer.
response = query_engine.query("What did the author do growing up?")
print(response)

In [50]:
# Reference: https://www.trulens.org/getting_started/quickstarts/llama_index_quickstart/
import numpy as np
from trulens.apps.llamaindex import TruLlama
from trulens.core import Feedback
from trulens.providers.openai import OpenAI

# Provider whose methods implement the feedback functions below.
provider = OpenAI(model_engine="gpt-4.1-mini")

# Selector for the retrieved context; where the context lives is app specific.
context = TruLlama.select_context(query_engine)

# Groundedness: all context chunks (collected into a list) vs. the output.
f_groundedness = Feedback(
    provider.groundedness_measure_with_cot_reasons,
    name="Groundedness",
).on(context.collect()).on_output()

# Answer relevance: the overall question vs. the overall answer.
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

# Context relevance: the question vs. each context chunk, aggregated with np.mean.
f_context_relevance = Feedback(
    provider.context_relevance_with_cot_reasons,
    name="Context Relevance",
).on_input().on(context).aggregate(np.mean)

✅ In Groundedness, input source will be set to __record__.calls[-1].rets.source_nodes[:].node.text.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.calls[-1].rets.source_nodes[:].node.text .


In [None]:
# Wrap the query engine in a TruLlama recorder, registered under app name
# "LlamaIndex_App" / version "base", with the three feedback functions attached.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_name="LlamaIndex_App",
    app_version="base",
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
)

In [51]:
# Run a single query inside the recorder's context manager; the `with`
# statement binds the context object to `recording`. The query result itself
# is not kept.
with tru_query_engine_recorder as recording:
    query_engine.query("What kind of writing did he do as a beginner?")

In [52]:
# Run a small batch of evaluation questions inside the recorder context and
# print each question next to the engine's answer.
eval_questions = [
    "What was the first computer he encountered?",
    "What did Paul Graham initially plan to study in college?",
]

with tru_query_engine_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)
        print(f"Question: {question}\nResponse: {response.response}\n---")

Question: What was the first computer he encountered?
Response: IBM 1401
---
Question: What did Paul Graham initially plan to study in college?
Response: Paul Graham initially planned to study philosophy in college.
---


In [53]:
# Launch the TruLens dashboard for this session (the recorded output shows it
# starts a streamlit process and reports a local URL).
from trulens.dashboard import run_dashboard
run_dashboard(session)

Starting dashboard ...
Dashboard already running at path:   Local URL: http://localhost:49896



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [None]:
from trulens.core import Feedback
# NOTE(review): this imports from `trulens_eval`, while the rest of the notebook
# imports from `trulens.*` — confirm that package is installed and still
# exposes `Groundedness`; the name is not used in any visible cell.
from trulens_eval.feedback import Groundedness

In [None]:
# Same configuration as the earlier index-building cell: re-create the
# documents, the index and the query engine from data_llama/.
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.openai import OpenAI

# Global LlamaIndex settings.
Settings.chunk_size = 128
Settings.chunk_overlap = 16
Settings.llm = OpenAI()

# Load the corpus and index it.
documents = SimpleDirectoryReader("data_llama").load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine configured with similarity_top_k=3.
query_engine = index.as_query_engine(similarity_top_k=3)

In [None]:
# Smoke test: run one query against the engine and print the answer.
response = query_engine.query("What did the author do growing up?")
print(response)

In [None]:
import numpy as np
from trulens.apps.llamaindex import TruLlama
from trulens.core import Feedback
from trulens.providers.openai import OpenAI

# Provider whose methods implement the feedback functions below.
provider = OpenAI(model_engine="gpt-4.1-mini")

# Selector for the retrieved context; where the context lives is app specific.
context = TruLlama.select_context(query_engine)

# Groundedness: all context chunks (collected into a list) vs. the output.
f_groundedness = Feedback(
    provider.groundedness_measure_with_cot_reasons,
    name="Groundedness",
).on(context.collect()).on_output()

# Answer relevance: the overall question vs. the overall answer.
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

# Context relevance: the question vs. each context chunk, aggregated with np.mean.
f_context_relevance = Feedback(
    provider.context_relevance_with_cot_reasons,
    name="Context Relevance",
).on_input().on(context).aggregate(np.mean)

In [None]:
# Recorder for the unfiltered query engine: app "LlamaIndex_App", version
# "base", evaluated with the three feedback functions.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_name="LlamaIndex_App",
    app_version="base",
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
)

In [None]:
# Run one query inside the recorder's context manager; `recording` is bound by
# the `with` statement and is read by the following cell.
with tru_query_engine_recorder as recording:
    query_engine.query("What did the author do growing up?")

In [None]:
from trulens.dashboard.display import get_feedback_result

# Take the last record of the `recording` context and display its
# "Context Relevance" feedback result (the name must match the `name=` given
# to the Feedback definition).
last_record = recording.records[-1]
get_feedback_result(last_record, "Context Relevance")

In [None]:
from trulens.apps.llamaindex.guardrails import WithFeedbackFilterNodes
# Import the TruLens provider under an explicit alias. The bare name `OpenAI`
# is bound to llama_index's LLM class in some cells and to the TruLens provider
# in others, so relying on it here would make this cell depend on which cell
# happened to run last.
from trulens.providers.openai import OpenAI as fOpenAI

# Separate provider instance used only for the guardrail feedback.
guardrail_provider = fOpenAI(model_engine="gpt-4.1-nano")

In [None]:
# Note: a feedback function used as a guardrail must return only a score, not
# a score plus reasons — hence `context_relevance` here rather than the
# `_with_cot_reasons` variant used for evaluation elsewhere in the notebook.
f_context_relevance_score = Feedback(guardrail_provider.context_relevance, name="Context Relevance")

In [None]:
# Wrap the query engine with the context-relevance guardrail. Presumably
# retrieved nodes scoring below threshold=0.5 are dropped before synthesis —
# confirm against the WithFeedbackFilterNodes documentation.
filtered_query_engine = WithFeedbackFilterNodes(
    query_engine, feedback=f_context_relevance_score, threshold=0.5
)

In [None]:
# Recorder for the guardrailed engine: same app name as the baseline recorder
# but app_version="filtered", evaluated with the same three feedback functions.
tru_recorder = TruLlama(
    filtered_query_engine,
    app_name="LlamaIndex_App",
    app_version="filtered",
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness],
)

In [None]:
# from trulens.eval import Feedback, OpenAI as fOpenAI
# NOTE(review): exploratory import — `Groundedness` is not used in any visible
# cell; confirm `trulens.feedback` exposes it in the installed version, or drop
# this cell.
from trulens.feedback import Groundedness

In [None]:
!pip show trulens
!pip show llama_index

In [None]:
# NOTE(review): both imports come from `trulens_eval`, whereas the rest of the
# notebook imports from `trulens.*` — confirm that package is installed.
# `fOpenAI` is re-bound from `trulens.providers.openai` in a later cell.
from trulens_eval import OpenAI as fOpenAI
from trulens_eval.feedback import Groundedness

In [None]:
# Importing the dotted submodule also imports its parent packages and binds
# the top-level name `trulens`.
import trulens.core.feedback.feedback

# List the names the module exposes.
dir(trulens.core.feedback.feedback)

In [None]:
import os

import openai
from trulens.core import Feedback, FeedbackMode, Select, TruSession
from trulens.apps.llamaindex import TruLlama
from trulens.providers.openai import OpenAI as fOpenAI

# Fresh session handle for the second part of the notebook; the database is
# reset again here.
session = TruSession()
session.reset_database()

In [None]:
!curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the downloaded PDF from the working directory. The result is a list of
# documents (one per page, according to the comment in the merge cell that
# follows).
documents = SimpleDirectoryReader(
    input_files=["./IPCC_AR6_WGII_Chapter03.pdf"]
).load_data()

In [None]:
# Merge into a single large document rather than one document per-page
from llama_index.core import Document

document = Document(text="\n\n".join([doc.text for doc in documents]))

In [None]:
# Fetch a file from Google Drive with gdown — presumably the
# sentence_index.tar.gz archive that the next cell unpacks (TODO confirm the
# downloaded filename).
!gdown "https://drive.google.com/uc?id=16pH4NETEs43dwJUvYnJ9Z-bsR9_krkrP"

In [None]:
# Unpack the gzip-compressed archive into the current working directory.
!tar -xzf sentence_index.tar.gz

In [None]:
# NOTE(review): ServiceContext is deprecated in recent llama_index releases in
# favour of the global `Settings` object (which earlier cells of this notebook
# already use) — confirm this cell still runs with the installed version.
from llama_index.core import ServiceContext
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceWindowNodeParser

# create the sentence window node parser w/ default settings
# (window_size=3 presumably means three sentences of surrounding context per
# node — confirm; the two metadata keys name where the window and the original
# sentence text are stored on each node)
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Bundle the LLM, a locally loaded embedding model and the node parser into a
# single context object.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    node_parser=node_parser,
)