# Lesson 2: RAG Triad of metrics

In [1]:
import warnings
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import utils

import os
import openai
# Fetch the OpenAI key through the project helper instead of hardcoding it in the notebook.
# NOTE(review): where the helper reads the key from is not visible here.
openai.api_key = utils.get_openai_api_key()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RistoHinno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
from trulens_eval import Tru

# `tru` is the handle used later for records, the leaderboard and the dashboard.
tru = Tru()
# Start from a clean slate. Presumably this drops records of earlier runs from the
# default SQLite database reported in the cell output — confirm before running
# against a database whose contents you want to keep.
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


## Load data

In [4]:
def list_files_recursively(directory):
    """Return the paths of all files under ``directory`` in a stable order.

    The tree is walked top-down. Sub-directories and file names are visited
    in sorted order so that repeated runs produce the same list; ``os.walk``
    on its own yields entries in an arbitrary, filesystem-dependent order.

    Args:
        directory: Root directory to scan.

    Returns:
        A list of ``os.path.join(root, name)`` strings, one per file. The
        list is empty when ``directory`` does not exist or holds no files
        (``os.walk`` ignores listing errors by default).
    """
    paths = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        paths.extend(os.path.join(root, name) for name in sorted(files))
    return paths

# Collect every file under the support-articles folder (path is relative to the working directory).
papers=list_files_recursively('support_articles/')

In [5]:
from llama_index.core import SimpleDirectoryReader

# Read the collected support articles; the reader receives an explicit file list.
reader = SimpleDirectoryReader(input_files=papers)
documents = reader.load_data()

In [6]:
from llama_index.core import Document

# Merge the text of all loaded documents into one Document, separated by blank lines.
combined_text = "\n\n".join(doc.text for doc in documents)
document = Document(text=combined_text)

In [7]:
from utils import build_sentence_window_index
from llama_index.llms.openai import OpenAI

# LLM handed to the index builder, configured with a low sampling temperature.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1, api_key=openai.api_key)

# Embeddings come from a local BGE model ('text-embedding-ada-002' was the
# alternative tried earlier); the index is saved under "sentence_index".
index_kwargs = dict(
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index",
)
sentence_index = build_sentence_window_index(document, llm, **index_kwargs)

In [8]:
from utils import get_sentence_window_query_engine

# Build the query engine on top of the sentence-window index, using the helper's defaults.
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [9]:
# Smoke-test the engine with a single question before wiring up any evaluation.
sanity_question = "What is needed to create topic model?"
output = sentence_window_engine.query(sanity_question)
output.response

'Texts as input and no labelling are needed to create a topic model.'

## Feedback functions

In [10]:
import nest_asyncio

# Patch asyncio so event loops can be nested; Jupyter already runs one.
# NOTE(review): presumably required because the evaluation code below uses asyncio — confirm.
nest_asyncio.apply()

In [11]:
# Imported under an alias so it does not shadow llama_index's OpenAI class imported earlier.
from trulens_eval import OpenAI as fOpenAI

# Feedback provider; its methods are the scoring functions passed to Feedback(...) below.
provider = fOpenAI(api_key=openai.api_key)

### 1. Answer Relevance

In [12]:
from trulens_eval import Feedback

# Answer relevance: scores the final response against the user's prompt.
# Uses the provider's `_with_cot_reasons` variant (presumably also returns the reasoning — confirm).
# on_input_output() binds prompt -> record main input and response -> record main output,
# as reported in the cell output.
f_qa_relevance = Feedback(
    provider.relevance_with_cot_reasons,
    name="Answer Relevance"
).on_input_output()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


### 2. Context Relevance

In [13]:
from trulens_eval import TruLlama

# Selector for the text of each retrieved source node; the feedback log output
# resolves it to __record__.app.query.rets.source_nodes[:].node.text.
context_selection = TruLlama.select_source_nodes().node.text

In [14]:
import numpy as np

# Context relevance: the question (record input) is scored against each retrieved
# context chunk and the per-chunk scores are averaged with np.mean.
# NOTE(review): this binding is overwritten by the next cell, which assigns
# f_qs_relevance again using qs_relevance_with_cot_reasons; only that later
# definition is passed to the recorder.
f_qs_relevance = (
    Feedback(provider.qs_relevance,
             name="Context Relevance")
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


In [15]:
import numpy as np

# Context relevance built on qs_relevance_with_cot_reasons: the question (record
# input) is scored against each retrieved context chunk, then averaged with np.mean.
# This rebinds f_qs_relevance, so it is the definition the recorder receives.
f_qs_relevance = (
    Feedback(provider.qs_relevance_with_cot_reasons,
             name="Context Relevance")
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .


### 3. Groundedness

In [16]:
from trulens_eval.feedback import Groundedness

# Groundedness helper backed by the same feedback provider as the other two metrics.
grounded = Groundedness(groundedness_provider=provider)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RistoHinno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Groundedness: the retrieved context text is the "source" and the final answer is
# the "statement" (see the bindings reported in the cell output).
# Per-source results are combined by the helper's grounded_statements_aggregator.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons,
             name="Groundedness"
            )
    .on(context_selection)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


## Evaluation of the RAG application

In [18]:
from trulens_eval import TruLlama
from trulens_eval import FeedbackMode

# The RAG triad: answer relevance, context relevance and groundedness.
rag_triad_feedbacks = [f_qa_relevance, f_qs_relevance, f_groundedness]

# Wrap the query engine in a recorder registered under the app id "App_1".
tru_recorder = TruLlama(
    sentence_window_engine,
    app_id="App_1",
    feedbacks=rag_triad_feedbacks,
)

In [19]:
# Load the evaluation questions: one per line, surrounding whitespace removed.
# Blank lines are skipped so that no empty query is later sent to the engine,
# and the encoding is pinned so the result does not depend on the OS default.
with open('data/eval_questions.txt', 'r', encoding='utf-8') as file:
    eval_questions = [line.strip() for line in file if line.strip()]

In [20]:
# Display the loaded questions as a quick sanity check.
eval_questions

['how can I make topic model?',
 'how much data is needed for topic model?',
 'can LLMs be used for topic models?',
 'what are the steps for training topic model?',
 'What kind of analysis can I make in the platform?',
 'how to find unwanted calls?',
 'what are options for sentiment detection in call transcripts?',
 'where should I start if I am total beginner in using the application?',
 'can I download data into excel file?',
 'anonymization is not working correctly, some names are not removed. What should I do?']

In [21]:
# Run every evaluation question through the engine inside the recorder context.
# Presumably each call is logged and scored by the configured feedbacks — the
# "Groundedness per statement" progress bars in the output come from that scoring.
# NOTE(review): `recording` is bound but never used in this loop.
for question in eval_questions:
    with tru_recorder as recording:
        sentence_window_engine.query(question)

Groundedness per statement in source:   0%|          | 0/5 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/5 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/6 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/6 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# Fetch the logged records plus the feedback column names.
# NOTE(review): app_ids=[] presumably means "no filter / all apps" — confirm.
records, feedback = tru.get_records_and_feedback(app_ids=[])
# Preview only the first rows; the frame carries wide JSON columns.
records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,Answer Relevance,Context Relevance,Groundedness,Answer Relevance_calls,Context Relevance_calls,Groundedness_calls,latency,total_tokens,total_cost
0,App_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_68a5a07d88c9d889c1e9d62bfe1fdafa,"""how can I make topic model?""","""To create a topic model, log in to the FS Tex...",-,"{""record_id"": ""record_hash_68a5a07d88c9d889c1e...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-05-14T09:31:12.040127"", ""...",2024-05-14T09:31:16.417120,1.0,0.5,1.0,[{'args': {'prompt': 'how can I make topic mod...,[{'args': {'question': 'how can I make topic m...,[{'args': {'source': 'Model creation form Now...,4,563,0.000888
1,App_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_18f471fc1b797765e626e795c71c09c1,"""how much data is needed for topic model?""","""The more data you have, the longer the traini...",-,"{""record_id"": ""record_hash_18f471fc1b797765e62...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-05-14T09:31:16.796523"", ""...",2024-05-14T09:31:20.074531,0.2,0.8,1.0,[{'args': {'prompt': 'how much data is needed ...,[{'args': {'question': 'how much data is neede...,[{'args': {'source': 'You have to provide the ...,3,441,0.000673
2,App_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_e934efd789a0bd6e47b6b3856c102f4d,"""can LLMs be used for topic models?""","""LLMs can be used for topic models.""",-,"{""record_id"": ""record_hash_e934efd789a0bd6e47b...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-05-14T09:31:20.432088"", ""...",2024-05-14T09:31:23.709058,0.2,0.2,0.0,[{'args': {'prompt': 'can LLMs be used for top...,[{'args': {'question': 'can LLMs be used for t...,[{'args': {'source': 'Model creation form Now...,3,391,0.000591
3,App_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_b46b33c7c6c3a4fd583b0ba70c713713,"""what are the steps for training topic model?""","""To train a topic model using the FS Text appl...",-,"{""record_id"": ""record_hash_b46b33c7c6c3a4fd583...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-05-14T09:31:24.001187"", ""...",2024-05-14T09:31:28.216402,0.8,0.7,1.0,[{'args': {'prompt': 'what are the steps for t...,[{'args': {'question': 'what are the steps for...,[{'args': {'source': 'The main topic/area: ...,4,610,0.000942
4,App_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.core.query_en...,record_hash_1a1e43c360341e65ac0aefd7d6f07cc6,"""What kind of analysis can I make in the platf...","""You can conduct topic prediction analysis on ...",-,"{""record_id"": ""record_hash_1a1e43c360341e65ac0...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-05-14T09:31:28.691255"", ""...",2024-05-14T09:31:31.661930,0.8,0.8,0.7,[{'args': {'prompt': 'What kind of analysis ca...,[{'args': {'question': 'What kind of analysis ...,[{'args': {'source': 'Predict “Predict”. Thi...,2,329,0.000499


In [23]:
import pandas as pd

# Do not truncate cell text, so full questions and answers stay readable.
# NOTE: this changes a global pandas display option for the rest of the session.
pd.set_option("display.max_colwidth", None)
# `feedback` is used as a list of column names here (the metric columns).
# Empty metric cells in the output presumably mean that scoring had not finished
# when the cell ran — TODO confirm and re-run once evaluation completes.
records[["input", "output"] + feedback]

Unnamed: 0,input,output,Answer Relevance,Context Relevance,Groundedness
0,"""how can I make topic model?""","""To create a topic model, log in to the FS Text application and choose the Model builder menu item. In the Model builder view, click on the New model option in the left-hand menu. Fill in the form with a descriptive name, description, business purpose, and language. Choose Topic model as the Model type from the dropdown menu and then press Create model. Follow the model creation steps as listed in tabs after creating the model.""",1.0,0.5,1.0
1,"""how much data is needed for topic model?""","""The more data you have, the longer the training process duration will be, usually not longer than a few minutes.""",0.2,0.8,1.0
2,"""can LLMs be used for topic models?""","""LLMs can be used for topic models.""",0.2,0.2,0.0
3,"""what are the steps for training topic model?""","""To train a topic model using the FS Text application, follow these steps:\n\n1. Log in to the FS Text application.\n2. Choose the Model builder menu item.\n3. In the Model builder view, click on the New model option in the left-hand menu.""",0.8,0.7,1.0
4,"""What kind of analysis can I make in the platform?""","""You can conduct topic prediction analysis on the platform.""",0.8,0.8,0.7
5,"""how to find unwanted calls?""","""To find unwanted calls, you can utilize a Topic model in the Model builder view. By analyzing the topics generated in the bubble chart, you can identify topics related to unwanted calls based on the words associated with each topic. Look for topics that contain keywords or phrases commonly associated with unwanted calls, such as spam, telemarketing, or nuisance. If you encounter topics that do not align with identifying unwanted calls, you may need to adjust the cleaning pipeline by removing more words or consider changing the number of topics to better capture the relevant information.""",0.8,0.3,
6,"""what are options for sentiment detection in call transcripts?""","""The options for sentiment detection in call transcripts include using sentiment analysis models that can analyze the text data to determine the sentiment expressed in the conversations. These models can classify the sentiment as positive, negative, or neutral based on the language used in the call transcripts.""",0.8,0.2,0.0
7,"""where should I start if I am total beginner in using the application?""","""Start by logging into the FS Text application and selecting the Model builder menu item.""",,,
8,"""can I download data into excel file?""","""Yes, you can download data into an Excel file.""",,,
9,"""anonymization is not working correctly, some names are not removed. What should I do?""","""You should review the list of words that are meant to be removed during the anonymization process. Check if the names that are not being removed are included in the list of words to be removed. If they are not, consider adding those specific names to the list in the correct format and save the updated list. This should help improve the anonymization process by ensuring that the names you want to remove are included in the removal list.""",,,


In [24]:
# Aggregate metrics per app. NOTE(review): app_ids=[] presumably means "all apps" — confirm.
tru.get_leaderboard(app_ids=[])

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Answer Relevance,Context Relevance,Groundedness,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
App_1,0.657143,0.5,0.616667,3.4,0.000723


In [25]:
# Launch the TruLens dashboard; the recorded output shows a Streamlit process
# being started and the dashboard URL being printed.
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://10.8.0.8:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]