Try evaluation with TruLens

Minimal example

In [None]:
from openai import OpenAI
from dotenv import load_dotenv
from trulens_eval import TruBasicApp
from trulens_eval import Feedback, OpenAI as fOpenAI, Tru

# Load variables from a .env file into the process environment before any client is built.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

In [None]:
# Database to store evals.
# The URL is passed by keyword: the first positional parameter of `Tru` is not
# the database URL in every trulens_eval release, whereas `database_url` is.
tru = Tru(database_url="sqlite:///trulens_eval.db")
# NOTE: every run of this cell resets the database, so evals recorded by
# earlier runs are discarded.
tru.reset_database()

In [None]:
# Mock QA app
# Module-level OpenAI client; `llm_standalone` below sends its chat requests through it.
client = OpenAI()


def llm_standalone(prompt):
    """Send `prompt` to the chat model and return the text of the first choice.

    The request is made through the module-level `client` with a fixed system
    message followed by the caller's prompt as the user message.

    Args:
        prompt: Text placed in the user message.

    Returns:
        The `message.content` of the first choice in the completion response.
    """
    system_message = {
        "role": "system",
        "content": "You are a question and answer bot, and you answer super upbeat.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )

    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Eval
# Feedback provider, plus a relevance feedback wired to the app's input and output.
fopenai = fOpenAI()
f_relevance = Feedback(fopenai.relevance).on_input_output()

# Wrap the mock QA function under the app id "Happy Bot" with the feedback attached.
tru_llm_standalone_recorder = TruBasicApp(
    llm_standalone,
    app_id="Happy Bot",
    feedbacks=[f_relevance],
)

# The app is invoked inside the recorder's context manager.
prompt_input = "How good is language AI?"
with tru_llm_standalone_recorder as recording:
    tru_llm_standalone_recorder.app(prompt_input)

In [None]:
# Start the TruLens dashboard for the `tru` session created earlier in this notebook.
tru.run_dashboard()

Triangle metrics: groundedness, answer relevance, and context relevance

In [None]:
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

import numpy as np

# Initialize provider class
# NOTE: this rebinds `fopenai`, which the minimal example earlier in the notebook also assigns.
fopenai = fOpenAI()

# Groundedness helper configured with the same OpenAI provider instance.
grounded = Groundedness(groundedness_provider=fopenai)

# Define a groundedness feedback function
# Arguments are selected from the record: the collected return values of the
# `retrieve` call, then the app output. Results are combined with the
# Groundedness helper's own aggregator.
# NOTE(review): assumes the instrumented app has a `retrieve` method — none is
# defined in the visible part of this notebook; confirm.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
# The question is taken from the `query` argument of the `retrieve` call and
# the answer from the app output.
# NOTE(review): assumes `retrieve` takes an argument named `query` — confirm against the app.
f_qa_relevance = (
    Feedback(fopenai.relevance_with_cot_reasons, name="Answer Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on_output()
)

# Question/statement relevance between question and each context chunk.
# The question is the `query` argument of the `retrieve` call; the context is
# the collected return values of that same call. Scores are reduced with np.mean.
# NOTE(review): assumes `retrieve` returns the context chunks — confirm against the app.
f_context_relevance = (
    Feedback(fopenai.qs_relevance_with_cot_reasons, name="Context Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets.collect())
    .aggregate(np.mean)
)