In [9]:
import os
import numpy as np
import litellm
import pandas as pd

In [178]:
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain import hub
from langchain_core.output_parsers import JsonOutputParser

In [174]:
from trulens_eval.feedback.provider.hugs import Huggingface
from trulens_eval.feedback.provider import Langchain, litellm
from trulens_eval import Feedback
from trulens_eval import TruChain, Tru
from trulens_eval.feedback import Groundedness, GroundTruthAgreement

In [12]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness, context_recall, context_relevancy, context_precision, answer_similarity, answer_correctness, AspectCritique

In [13]:
from trulens_eval.app import App

In [14]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [242]:
class RAG_testing():

    def __init__(self, embedding_model_name, chunk_size, llm, qa_dataset_path):
        self.embedding_model_name = embedding_model_name
        self.chunk_size = chunk_size
        self.index_name = f"index_chunk:{self.chunk_size}_embeddings:{self.embedding_model_name.replace('/', '~')}"
        self.index_folder_path = f"./data/indexes/{self.index_name}/"
        self.embedding_model = self.load_embedding_model()
        self.knowledge_vector_database = self.load_knowledge_vector_database()
        self.prompt = hub.pull("rlm/rag-prompt")
        self.llm = llm
        self.qa_dataset = pd.read_csv(qa_dataset_path)
        self.rag_chain = self.create_rag_chain()
        self.golden_set, self.golden_df = self.create_golden_set()
        

    def format_docs(self, docs):
        return "\n\n".join(doc.page_content for doc in docs)
    
    def load_embedding_model(self):
        embedding_model = HuggingFaceEmbeddings(
            model_name=self.embedding_model_name,
            multi_process=True,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
        )
        return embedding_model

    def create_rag_chain(self):
        return (
            RunnableParallel({"context": self.knowledge_vector_database.as_retriever() | self.format_docs, "question": RunnablePassthrough()})
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

    def create_golden_set(self):
        questions = self.qa_dataset["question"].to_list()
        answers = self.qa_dataset["answer"].to_list()
        golden_set = []
        datapoint_list = []
        for question, answer in zip(questions, answers):
            datapoint = {}
            datapoint["query"] = question
            datapoint["response"] = answer
            golden_set.append(datapoint)
            datapoint_list.append({"question": question, "groundtruth_answer": answer, "llm_anaswer": self.rag_chain.invoke(question)})
        golden_df = pd.DataFrame()
        return golden_set, golden_df

    def load_knowledge_vector_database(self):
        if os.path.isdir(self.index_folder_path):
            knowledge_vector_database = FAISS.load_local(
                self.index_folder_path,
                self.embedding_model,
                distance_strategy=DistanceStrategy.COSINE,
                allow_dangerous_deserialization=True
            )
            return knowledge_vector_database
        else:
            raise Exception("Vector Database path does not exist")
    
    def load_context(self):
        self.context = App.select_context(self.rag_chain)

    def trulens_metrics(self, llm_provider, huggingface_provider):
        self.load_context()
        grounded = Groundedness(groundedness_provider=llm_provider)
        # Define a groundedness feedback function
        f_groundedness = (
            Feedback(grounded.groundedness_measure_with_cot_reasons)
            .on(self.context.collect()) # collect context chunks into a list
            .on_output()
            .aggregate(grounded.grounded_statements_aggregator)
        )

        # Question/answer relevance between overall question and answer.
        f_answer_relevance = (
            Feedback(llm_provider.relevance)
            .on_input_output()
        )

        # Question/statement relevance between question and each context chunk.
        f_context_relevance = (
            Feedback(llm_provider.context_relevance_with_cot_reasons)
            .on_input()
            .on(self.context)
            .aggregate(np.mean)
        )

        # f_groundtruth = Feedback(GroundTruthAgreement(self.golden_set).agreement_measure, name = "Ground Truth").on_input_output()

        f_pii_detection = Feedback(huggingface_provider.pii_detection_with_cot_reasons, name= "PII score").on_input()

        f_coherence = Feedback(llm_provider.coherence_with_cot_reasons).on_output()

        f_comprehensiveness = Feedback(llm_provider.comprehensiveness_with_cot_reasons).on_input_output()

        f_conciseness = Feedback(llm_provider.conciseness_with_cot_reasons).on_output()

        f_correctness = Feedback(llm_provider.correctness_with_cot_reasons).on_output()

        f_criminality = Feedback(llm_provider.criminality_with_cot_reasons).on_output()

        f_harmfulness = Feedback(llm_provider.harmfulness_with_cot_reasons).on_output()

        f_helpfulness = Feedback(llm_provider.helpfulness_with_cot_reasons).on_output()

        f_maliciousness = Feedback(llm_provider.maliciousness_with_cot_reasons).on_output()

        self.trulens_feedback = [f_groundedness, f_answer_relevance, f_context_relevance, f_pii_detection,
                                 f_coherence, f_comprehensiveness,
                f_conciseness, f_correctness, f_criminality, f_harmfulness, f_helpfulness, f_maliciousness]
    
    def create_tru_recorder(self, name):
        self.tru = Tru()
        self.tru.reset_database()
        self.tru_chain = TruChain(self.rag_chain,
                app_id=name,
                feedbacks=self.trulens_feedback,
                metadata = {})
    
    def get_trulens_metrics(self, name="RAG_Triad_Testing"):
        self.create_tru_recorder(name)
        with self.tru_chain as recording:
            for qa in self.golden_set[:1]:
                llm_response = self.rag_chain.invoke(qa["query"])
        records, feedback = self.tru.get_records_and_feedback(app_ids=[name])
        return records, feedback

    
    def ragas_metrics(self):
        pass

In [243]:
from langchain_community.llms import HuggingFaceHub

repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "zephyr-7b-beta"

READER_LLM = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "Temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

In [244]:
rag_testing = RAG_testing("thenlper/gte-small", 512, READER_LLM, "/Users/priyanshutuli/Desktop/RAG_pipeline_testing/Synthetic_QA_Dataset/mistral_qa_dataset.csv")

In [245]:
from trulens_eval.feedback.provider import OpenAI
provider = OpenAI(model_engine="gpt-4-turbo")

In [246]:
from trulens_eval.feedback.provider.litellm import LiteLLM
litellm_provider = LiteLLM(model_engine="mistral/mistral-tiny", info=True)

In [247]:
litellm.set_verbose=True

In [250]:
rag_testing.trulens_metrics(provider, huggingface_provider=Huggingface("mistralai/Mistral-7B-Instruct-v0.1"))

✅ In groundedness_measure_with_cot_reasons, input source will be set to __record__.app.first.steps.context.first.get_relevant_documents.rets.collect() .
✅ In groundedness_measure_with_cot_reasons, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In context_relevance_with_cot_reasons, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context_relevance_with_cot_reasons, input context will be set to __record__.app.first.steps.context.first.get_relevant_documents.rets .
✅ In PII score, input text will be set to __record__.main_input or `Select.RecordInput` .


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


In [251]:
records, feedback = rag_testing.get_trulens_metrics(name="Test4")

Groundedness per statement in source:   0%|          | 0/15 [00:00<?, ?it/s]

In [252]:
records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,latency,total_tokens,total_cost
0,Test4,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",RunnableSequence(langchain_core.runnables.base),record_hash_e5603a89b63c7e5b22fcdb5a28175e0c,"""Why did total deposits decrease?\n""","""Human: You are an assistant for question-answ...",-,"{""record_id"": ""record_hash_e5603a89b63c7e5b22f...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-04-18T13:03:15.163947"", ""...",2024-04-18T13:03:28.036055,12,0,0.0


In [253]:
feedback

[]

In [206]:
with tru_recorder as recording:
    for question in rag_testing.qa_dataset["question"].to_list()[:1]:
        llm_response = rag_testing.rag_chain.invoke(question)
        print(llm_response)

Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: Why did total deposits decrease?
 
Context: origination volumes reflecting credit tightening actions and 
rising interest rates; and 
• a decline in Paycheck Protection Program loans in Consumer 
and Small Business Banking. 
Total deposits (average) decreased due to consumer deposit 
outflows on consumer spending, as well as customer migration 
to higher yielding alternatives. 
Wells Fargo & Company 3 
16

Total assets 
Total deposits $ 
borrowed 
$ 130,008 
59,020 17,804 
206,832 
559,520 
158,770 109,634 
42,696 24,540 
176,870 
567,733 
162,439 20,374 16,324 
(6,736) 
29,962 
(8,213) (3,669) 19 38 
(27) 
17 
(1) (2) $ 
$ 130,008 
59,020 
17,804 
206,832 
559,520 
158,770 109,634 
42,696 24,540 
176,870 
567,733 
162,439 20,374 16,32

In [207]:
recs = recording.get() # use .get if only one record
# recs = recording.records # use .records if multiple
display(recs)

Record(record_id='record_hash_d468d579f561fc86968ffc5d8575ef65', app_id='TruLens_Testing', cost=Cost(n_requests=0, n_successful_requests=0, n_classes=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, cost=0.0), perf=Perf(start_time=datetime.datetime(2024, 4, 18, 0, 46, 39, 234998), end_time=datetime.datetime(2024, 4, 18, 0, 46, 50, 416210)), ts=datetime.datetime(2024, 4, 18, 0, 46, 50, 416314), tags='-', meta=None, main_input='Why did total deposits decrease?\n', main_output="Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: Why did total deposits decrease?\n \nContext: origination volumes reflecting credit tightening actions and \nrising interest rates; and \n• a decline in Paycheck Protection Program loans in Consumer \nand Small Business Banking. \nTotal 

In [208]:
for feedback, feedback_result in recs.wait_for_feedback_results().items():
    print(feedback.name, feedback_result.result)

PII score 1.0
Positive Sentiment Score None
Language Match Score None
Hallucination Score None


In [177]:
records, feedback = tru.get_records_and_feedback(app_ids=["RAG-Testing"])

records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,latency,total_tokens,total_cost
