In [1]:
!pip install -q trulens_eval llama_index llama-index-llms-openai llama_hub llmsherpa llama-cpp-python

In [2]:
# Import the os module to interact with the operating system
import os

# Import the load_dotenv function from the dotenv module
from dotenv import load_dotenv

# Call the load_dotenv function to load environment variables from a .env file
load_dotenv()

import warnings
warnings.filterwarnings('ignore')

In [3]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [4]:
# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [5]:
qa_dataset_path = "/Users/priyanshutuli/Desktop/RAG_pipeline_testing/Synthetic_QA_Dataset/mistral_qa_dataset.csv"

In [6]:
import pandas as pd

In [7]:
qa_dataset = pd.read_csv(qa_dataset_path)

In [8]:
qa_dataset.head()

Unnamed: 0,context,question,answer,source_doc,groundedness_score,groundedness_eval,relevance_score,relevance_eval,standalone_score,standalone_eval
0,Total loans (average) increased driven by ne...,Why did total deposits decrease?\n,Total deposits decreased due to customer migra...,"{'source': '/content/docs/q3_2023.pdf', 'page'...",5,The context clearly states that total deposits...,4,This question is useful for investors because ...,5,"The question is context-independant, as it doe..."
1,"Quarter ended June 30, 2022 \nNet interest inc...",What is the net income for the quarter ended J...,1393,"{'source': '/content/docs/q2_2023.pdf', 'page'...",5,The context provides the net income for the qu...,5,This question is extremely useful for investor...,5,"The question is context-independant, as it ask..."
2,TROUBLED DEBT RESTRUCTURINGS (TDRs) In January...,"What was the amount of TDRs at December 31, 20...","TDRs totaled $9.2 billion at December 31, 2022.","{'source': '/content/docs/q2_2023.pdf', 'page'...",5,The context provides the amount of TDRs at Dec...,4,This question is useful for investors as it re...,5,"The question is context-independant, as it ref..."
3,STATEMENT SCHEDULES \n1. FINANCIAL STATEMENTS ...,What is the location of the Company's consolid...,The Company's consolidated financial statement...,"{'source': '/content/docs/q4_2023.pdf', 'page'...",5,The context provides information about the loc...,4,This question is useful for investors as it he...,5,The question is asking for a specific piece of...
4,Noninterest Income\nTable 2: Noninterest Inc...,How much did investment banking fees increase ...,Investment banking fees increased by 117 milli...,"{'source': '/content/docs/q3_2023.pdf', 'page'...",5,The context provides the change in investment ...,4,This question is useful for investors interest...,5,The question is context-independent and self-c...


In [9]:
questions = qa_dataset["question"].to_list()

In [10]:
questions

['Why did total deposits decrease?\n',
 'What is the net income for the quarter ended June 30, 2022?\n',
 'What was the amount of TDRs at December 31, 2022?\n',
 "What is the location of the Company's consolidated financial statements?\n",
 'How much did investment banking fees increase in the third quarter of 2023 compared to the third quarter of 2022?\n',
 'Which court is the interchange litigation consolidated in?\n',
 'What is the net interest income for the quarter ended March 31, 2022?\n',
 'What is the outstanding balance of auto loans with a FICO score of 740 or higher as of September 30, 2023?\n',
 'What is the stress capital buffer for the period October 1, 2022, through September 30, 2023?\n',
 'What is the average age of the rail cars?\n',
 'What is the earliest authorized redemption date for Series Q Preferred Stock?\n',
 'What was the net income before noncontrolling interests for Wells Fargo in the first quarter of 2023?\n',
 'What is the carrying value of loans to tax c

In [11]:
groundtruth_answers = qa_dataset["answer"].to_list()

In [12]:
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [13]:
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [14]:
from trulens_eval import TruChain, Tru
tru = Tru()
tru.reset_database()
from trulens_eval.feedback.provider import Langchain
from trulens_eval import Feedback

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


In [15]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [16]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

In [17]:
embedding_model_name = "thenlper/gte-small"

In [18]:
embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            multi_process=True,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
        )

In [19]:
index_name = f"index_chunk:512_embeddings:{embedding_model_name.replace('/', '~')}"
index_folder_path = f"./data/indexes/{index_name}/"
if os.path.isdir(index_folder_path):
    knowledge_vector_database = FAISS.load_local(
        index_folder_path,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
        allow_dangerous_deserialization=True
    )

In [20]:
retriever = knowledge_vector_database.as_retriever()

In [21]:
from langchain import hub

In [22]:
prompt = hub.pull("rlm/rag-prompt")

In [23]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [24]:
from langchain_community.llms import HuggingFaceHub

repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "zephyr-7b-beta"

READER_LLM = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "Temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

  warn_deprecated(


In [25]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | READER_LLM
    | StrOutputParser()
)

In [26]:
from trulens_eval.feedback.provider.hugs import Huggingface
huggingface_provider = Huggingface()

In [27]:
from trulens_eval.feedback.provider import Langchain

In [28]:
from llama_cpp import Llama

In [33]:
# llm = Llama.from_pretrained(
#     repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
#     filename="*q8_0.gguf",
#     verbose=True,
#     local_dir="./models"
# )

In [44]:
from langchain_community.llms import LlamaCpp

llama = LlamaCpp(model_path = "/Users/priyanshutuli/Desktop/RAG_pipeline_testing/models/qwen1_5-0_5b-chat-q8_0.gguf",
                 )
llama.model_kwargs = {
    "max_tokens": 1000
}

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /Users/priyanshutuli/Desktop/RAG_pipeline_testing/models/qwen1_5-0_5b-chat-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.name str              = Qwen1.5-0.5B-Chat-AWQ-fp16
llama_model_loader: - kv   2:                          qwen2.block_count u32              = 24
llama_model_loader: - kv   3:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv   4:                     qwen2.embedding_length u32              = 1024
llama_model_loader: - kv   5:                  qwen2.feed_forward_length u32              = 2816
llama_model_loader: - kv   6:                 qwen2.attention.head_count u32              = 16
llama_mod

In [32]:
# from trulens_eval.feedback.provider.langchain import Langchain
# from langchain_community.llms import OpenAI

# gpt3_llm = OpenAI(model="gpt-3.5-turbo-instruct")
# langchain_provider = Langchain(chain = gpt3_llm)

In [63]:
lc = Langchain(chain=READER_LLM)

In [75]:
from trulens_eval import LiteLLM

In [77]:
import litellm

In [78]:
from trulens_eval.feedback.provider.litellm import LiteLLM
litellm_provider = LiteLLM()

ModuleNotFoundError: 
litellm package is required for using LiteLLM models.
You should be able to install it with pip:

    ```bash
    pip install "litellm>=1.25.2"
    ```


In [51]:
from trulens_eval.app import App
context = App.select_context(rag_chain)

In [64]:
from trulens_eval.feedback import Groundedness
import numpy as np
grounded = Groundedness(groundedness_provider=huggingface_provider)
# Define a groundedness feedback function
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on(context.collect()) # collect context chunks into a list
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
    Feedback(lc.relevance)
    .on_input_output()
)
# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(lc.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

✅ In groundedness_measure_with_cot_reasons, input source will be set to __record__.app.first.steps.context.first.get_relevant_documents.rets.collect() .
✅ In groundedness_measure_with_cot_reasons, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In context_relevance_with_cot_reasons, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance_with_cot_reasons, input context will be set to __record__.app.first.steps.context.first.get_relevant_documents.rets .


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


In [65]:
from trulens_eval.feedback import GroundTruthAgreement

In [66]:
golden_set = []
for question, answer in zip(questions, groundtruth_answers):
    datapoint = {}
    datapoint["query"] = question
    datapoint["response"] = answer
    golden_set.append(datapoint)

In [67]:
# from trulens_eval.feedback.provider import OpenAI
# from trulens_eval import Feedback
# import numpy as np

# # Initialize provider class
# provider = OpenAI()

# # select context to be used in feedback. the location of context is app specific.
# from trulens_eval.app import App
# context = App.select_context(rag_chain)

# from trulens_eval.feedback import Groundedness
# grounded = Groundedness(groundedness_provider=huggingface_provider)
# # Define a groundedness feedback function
# f_groundedness = (
#     Feedback(grounded.groundedness_measure_with_cot_reasons)
#     .on(context.collect()) # collect context chunks into a list
#     .on_output()
#     .aggregate(grounded.grounded_statements_aggregator)
# )

# # Question/answer relevance between overall question and answer.
# f_answer_relevance = (
#     Feedback(provider.relevance)
#     .on_input_output()
# )
# # Question/statement relevance between question and each context chunk.
# f_context_relevance = (
#     Feedback(provider.context_relevance_with_cot_reasons)
#     .on_input()
#     .on(context)
#     .aggregate(np.mean)
# )

f_groundtruth = Feedback(GroundTruthAgreement(golden_set).agreement_measure, name = "Ground Truth").on_input_output()

✅ In Ground Truth, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground Truth, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [68]:
tru_recorder = TruChain(rag_chain,
    app_id='TruLens_Testing',
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])

In [69]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [70]:
from time import sleep

In [71]:
with tru_recorder as recording:
    for question in questions[:1]:
        llm_response = rag_chain.invoke(question)


In [72]:
# The record of the app invocation can be retrieved from the `recording`:

recs = recording.get() # use .get if only one record
# recs = recording.records # use .records if multiple
display(recs)

Record(record_id='record_hash_965bd039ece6628fc1de3986b661c747', app_id='TruLens_Testing', cost=Cost(n_requests=0, n_successful_requests=0, n_classes=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, cost=0.0), perf=Perf(start_time=datetime.datetime(2024, 4, 16, 17, 36, 34, 321194), end_time=datetime.datetime(2024, 4, 16, 17, 36, 44, 415281)), ts=datetime.datetime(2024, 4, 16, 17, 36, 44, 415346), tags='-', meta=None, main_input='Why did total deposits decrease?\n', main_output="Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: Why did total deposits decrease?\n \nContext: origination volumes reflecting credit tightening actions and \nrising interest rates; and \n• a decline in Paycheck Protection Program loans in Consumer \nand Small Business Banking. \nTot

In [73]:
recs

Record(record_id='record_hash_965bd039ece6628fc1de3986b661c747', app_id='TruLens_Testing', cost=Cost(n_requests=0, n_successful_requests=0, n_classes=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, cost=0.0), perf=Perf(start_time=datetime.datetime(2024, 4, 16, 17, 36, 34, 321194), end_time=datetime.datetime(2024, 4, 16, 17, 36, 44, 415281)), ts=datetime.datetime(2024, 4, 16, 17, 36, 44, 415346), tags='-', meta=None, main_input='Why did total deposits decrease?\n', main_output="Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: Why did total deposits decrease?\n \nContext: origination volumes reflecting credit tightening actions and \nrising interest rates; and \n• a decline in Paycheck Protection Program loans in Consumer \nand Small Business Banking. \nTot

In [74]:
# The results of the feedback functions can be rertireved from
# `Record.feedback_results` or using the `wait_for_feedback_result` method. The
# results if retrieved directly are `Future` instances (see
# `concurrent.futures`). You can use `as_completed` to wait until they have
# finished evaluating or use the utility method:
# for rec in recs:
for feedback, feedback_result in recs.wait_for_feedback_results().items():
    print(feedback.name, feedback_result.result)

# See more about wait_for_feedback_results:
# help(rec.wait_for_feedback_results)

relevance None
context_relevance_with_cot_reasons None
groundedness_measure_with_cot_reasons None


In [151]:
records, feedback = tru.get_records_and_feedback(app_ids=["Chain2_ChatApplication"])

records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,...,relevance,Ground Truth,context_relevance_with_cot_reasons,groundedness_measure_with_cot_reasons_calls,relevance_calls,Ground Truth_calls,context_relevance_with_cot_reasons_calls,latency,total_tokens,total_cost
0,Chain2_ChatApplication,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_2111616e1b9903034af95f18aeffb2dd,"""Why did total deposits decrease?\n""","""Human: You are an assistant for question-answ...",-,"{""record_id"": ""record_hash_2111616e1b9903034af...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-04-16T14:26:01.498515"", ""...",...,0.8,0.9,1.0,[],[{'args': {'prompt': 'Why did total deposits d...,[{'args': {'prompt': 'Why did total deposits d...,[{'args': {'question': 'Why did total deposits...,13,0,0.0
1,Chain2_ChatApplication,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_1a6bd6ef9430d61fd59087862905dcf5,"""What is the net income for the quarter ended ...","""Human: You are an assistant for question-answ...",-,"{""record_id"": ""record_hash_1a6bd6ef9430d61fd59...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-04-16T14:26:14.869361"", ""...",...,0.2,1.0,0.8,[],[{'args': {'prompt': 'What is the net income f...,[{'args': {'prompt': 'What is the net income f...,[{'args': {'question': 'What is the net income...,13,0,0.0
2,Chain2_ChatApplication,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_8078a0cdd7b65a509d2dcb064d6553f8,"""What was the amount of TDRs at December 31, 2...","""Human: You are an assistant for question-answ...",-,"{""record_id"": ""record_hash_8078a0cdd7b65a509d2...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-04-16T14:26:28.599618"", ""...",...,1.0,1.0,1.0,[],[{'args': {'prompt': 'What was the amount of T...,[{'args': {'prompt': 'What was the amount of T...,[{'args': {'question': 'What was the amount of...,11,0,0.0
3,Chain2_ChatApplication,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_32cd84ac65980a60daa8bb0008a5e268,"""What is the location of the Company's consoli...","""Human: You are an assistant for question-answ...",-,"{""record_id"": ""record_hash_32cd84ac65980a60daa...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-04-16T14:26:40.257104"", ""...",...,0.8,1.0,0.9,[],[{'args': {'prompt': 'What is the location of ...,[{'args': {'prompt': 'What is the location of ...,[{'args': {'question': 'What is the location o...,15,0,0.0
4,Chain2_ChatApplication,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_4320153cd69924086336cd3a1265ad73,"""How much did investment banking fees increase...","""Human: You are an assistant for question-answ...",-,"{""record_id"": ""record_hash_4320153cd6992408633...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-04-16T14:26:55.919818"", ""...",...,0.8,0.8,0.8,[],[{'args': {'prompt': 'How much did investment ...,[{'args': {'prompt': 'How much did investment ...,[{'args': {'question': 'How much did investmen...,9,0,0.0


In [152]:
records.to_csv("Trulens_eval_synthetic_dataset.csv", index=False)

In [154]:
tru.get_leaderboard(app_ids=["Chain1_ChatApplication"])

Unnamed: 0_level_0,relevance,context_relevance_with_cot_reasons,Ground Truth,groundedness_measure_with_cot_reasons,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chain1_ChatApplication,0.816304,0.905747,0.898969,0.376306,10.495327,0.0


In [1]:
tru.stop_dashboard() # stop if needed

NameError: name 'tru' is not defined

In [155]:
tru.run_dashboard() # open a local streamlit app to explore

# tru.stop_dashboard() # stop if needed

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.29.83:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [65]:
from trulens_eval import TruChain, Tru
tru = Tru()
tru.reset_database()
from trulens_eval.feedback.provider import Langchain
from trulens_eval import Feedback