In [20]:
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
tru = Tru()

import dotenv
dotenv.load_dotenv()

from pinecone_code import PineconeServerless

In [21]:
#prompt = "You are a chatbot build to answer questions about the meeting. You will receive relevent meeting transcript. You should understand the transcript and answer the user query. You can ask questions to the user and answer their questions based on the context provided."
#prompt = "You are a chatbot build to answer questions about the meeting. You will refrain answering questions for which the context is not provided. You will answer the questions solely based on the context provided. You can ask questions to the user and answer their questions based on the context provided."
prompt  = "You are a friendly chat model. You will respond to the user's message using only the context provided. The context can be anything from a single message to a whole conversation. You can also ask questions to the user."
#prompt = "You are a friendly chatbot build to answer questions about the meeting. You will be benalized if you answer questions for which the context is not provided. You will answer the questions solely based on the context provided. You can ask questions to the user and answer their questions based on the context provided."
#prompt = "You are provided the meeting transcript file. The data includes meeting speaker, time of the conversation and the conversation text. Your job is to answer user questions based on the context provided. You can ask questions to the user and answer their questions based on the context provided."
#prompt = "Youa re a professional chat bot. You will answer the user's questions based on the context provided. You can ask questions to the user and answer their questions based on the context provided. You will refrain from answering questions for which the context is not provided. If the context is not clear, you will ask the user for more information. You will be penalized if you answer questions for which the context is not provided."

In [22]:
from openai import OpenAI
oai_client = OpenAI()

oai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=prompt,
    )

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.02812923863530159, 0.008014813996851444, 0.0110767288133502, -0.017456278204917908, -0.009064613841474056, 0.01981159672141075, -0.02820999175310135, -0.005037018097937107, -0.010767173022031784, -0.022436095401644707, -0.008849269710481167, 0.009818315505981445, -0.00585128553211689, -0.01360701397061348, -0.013035008683800697, -0.0370929092168808, 0.018573373556137085, -0.012866770848631859, 0.01565277762711048, -0.019650090485811234, -0.01581428386271, 0.0039569358341395855, -0.010275919921696186, -0.0022526944521814585, -0.019959645345807076, 0.0006334125646390021, 0.0049394406378269196, -0.010026929900050163, -0.008304181508719921, -0.027644716203212738, -0.014589519239962101, 0.004959628917276859, -0.0007183722918853164, -0.03111712820827961, 0.0015494634862989187, 0.01241589616984129, 0.009542406536638737, 0.0059656864032149315, 0.02912520244717598, 0.019475122913718224, 0.03203233703970909, 0.017187099903821945, -0.007772552

In [23]:
def parse_conversations(conversations) -> str:
    data = []
    for cluster_id, cluster_df in conversations.items():
        for i, row in cluster_df.iterrows():
            data.append(
                f"{row['start_time']} - {row['speaker']}: {row['text']}"
            )
        data.append("\n\n")
    data = "\n".join(data)
    return data

In [24]:
class RAG_from_scratch:
    @instrument
    def retrieve(self, query: str, in_filter: list) -> list:
        """
        Retrieve relevant text from vector store.
        """
        obj1 = PineconeServerless()
        response1 = obj1.query_pinecone(query, in_filter)
        conversation1_df = obj1.query_delta_conversations()
        #text_list = list(conversation1_df)
        context = parse_conversations(conversation1_df)
        return context
    
    @instrument
    def generate_completion(self, query: str, context_str) -> str:
        """
        Generate answer from context.
        """
        completion = oai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=
        [
            {"role": "user",
            "content": 
            f"We have provided context information below. \n"
            f"---------------------\n"
            f"{context_str}"
            f"\n---------------------\n"
            f"Given this information, please answer the question: {query}"
            }
        ]
        ).choices[0].message.content
        return completion

    @instrument
    def query(self, query: str, in_filter:list) -> str:
        context_str = self.retrieve(query, in_filter)
        #print("CONTEXT ",context_str)
        completion = self.generate_completion(query, context_str)
        return completion

rag = RAG_from_scratch()

In [25]:
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

import numpy as np
# Initialize provider class
fopenai = fOpenAI()

grounded = Groundedness(groundedness_provider=fopenai)

# Define a groundedness feedback function
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_qa_relevance = (
    Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on_output()
)

# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets.collect())
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.app.retrieve.args.query .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.app.retrieve.args.query .
✅ In Context Relevance, input statement will be set to __record__.app.retrieve.rets.collect() .


In [26]:
from trulens_eval import TruCustomApp
tru_rag = TruCustomApp(rag,
    app_id = 'RAG v1',
    feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance])

Function <function RAG_from_scratch.generate_completion at 0x000001AEB8EAD820> was not found during instrumentation walk. Make sure it is accessible by traversing app <__main__.RAG_from_scratch object at 0x000001AEBDF09070> or provide a bound method for it as TruCustomApp constructor argument `methods_to_instrument`.
Function <function RAG_from_scratch.retrieve at 0x000001AEB8EAD670> was not found during instrumentation walk. Make sure it is accessible by traversing app <__main__.RAG_from_scratch object at 0x000001AEBDF09070> or provide a bound method for it as TruCustomApp constructor argument `methods_to_instrument`.
Function <function RAG_from_scratch.query at 0x000001AEB8EAD940> was not found during instrumentation walk. Make sure it is accessible by traversing app <__main__.RAG_from_scratch object at 0x000001AEBDF09070> or provide a bound method for it as TruCustomApp constructor argument `methods_to_instrument`.


In [27]:
# if testing with clustering 
#in_filter = ['a4454589-6c49-4d8d-a6f6-c2b5c99ef229', '7c3c7ac7-94ee-4045-b94a-e9b1695c5b7e']
in_filter = []  #if not using clustering

In [28]:


'''
# Create a CSV with Q&A pairs
# read using pd


'''

queries = ["how many patients came to the hospital campuses today?",
           "How many patients are in the ICU?",
           "What is the recommended waiting period for COVID-19 vaccination after receiving monoclonal antibodies treatment for individuals who have previously contracted COVID-19?",
           "How much of hospital staff is expected to be lost?",
           "When will children between 5-11 years old be eligible for COVID-19 vaccination?",
           "Do you have high risk if you live in King County?",
           
    
           "How heavy is a shark?",
           "What is the capital of France?",
           
           
           "When would people not have to wear masks in schools?",
           "Who recommends universal masking?",
           ]

ground_truths = ["The hospital has 54 patients across the four campuses today.",
                 "There are 23 patients in the ICU.",
                 "The recommended waiting period for COVID-19 vaccination after receiving monoclonal antibodies treatment for individuals who have previously contracted COVID 19 is 90 days.",
                 "5 percent of hospital staff is expected to be lost.",
                 "Children between 5-11 years old will be eligible for COVID-19 vaccination starting from November 1st.",
                 "Yes, if you live in King County, you have nine folds greater likelihood of getting infected.",
                 
                 
                 "The provided context information does not contain any relevant information about the weight of a shark.",
                 "The provided context information does not contain any relevant information about Pais",
                 
                 
                 "People would not have to wear masks in schools if the school have met the 80% vaccination criteria.",
                 "The American Academy of Pediatrics and the Centers for Disease Control recommend universal masking.",
                 ]

qna = []

In [None]:
completion = rag.query("how many patients came to the hospital campuses today?", in_filter)

In [None]:
completion

In [None]:
for query in queries:
    with tru_rag as recording:
        completion = rag.query(query, in_filter)
        print(completion)
        qna.append([query, completion, ground_truths[queries.index(query)]])

In [29]:
tru.get_leaderboard(app_ids=["RAG v1"])

Unnamed: 0_level_0,Answer Relevance,Context Relevance,Groundedness,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RAG v1,0.97,0.63,0.7,3.2,0.003014


In [None]:
for q,a,gt in qna:
    print(f"Query: {q}\nAnswer: {a}", f"\nGround Truth: {gt}\n\n")

In [None]:
import pandas as pd
res = pd.DataFrame(qna, columns = ['Query', 'Answer', 'Ground Truth'])
res.to_csv('rag_results.csv', index = False)

In [None]:
results_df=tru.get_records_and_feedback(app_ids=["RAG v1"])[0]
results_df.to_csv('rag_feedback.csv', index = False)

In [None]:
# RUNS THE DASHBOARD
tru.run_dashboard()

In [None]:
# #evaluating questions and answers uby finding cosine similarity between the question and the answer

# from sentence_transformers import SentenceTransformer, util
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# def get_similarity_score(query, answer):
#     query_embedding = model.encode(query, convert_to_tensor=True)
#     answer_embedding = model.encode(answer, convert_to_tensor=True)
#     cosine_scores = util.pytorch_cos_sim(query_embedding, answer_embedding)
#     return cosine_scores.item()

# # for q,a in qna:
# #     print(f"Query: {q}\nAnswer: {a}\nSimilarity Score: {get_similarity_score(q,a)}\n\n")
    
# res['Query_Answer Similarity Score'] = res.apply(lambda x: get_similarity_score(x['Query'], x['Answer']), axis=1)    
# res['Answer_Ground Truth Similarity Score'] = res.apply(lambda x: get_similarity_score(x['Ground Truth'], x['Answer']), axis=1) 


In [None]:
# EXTRA EVAL: ROGUE

In [None]:
# Getting thr context from the conversation that was received from the Pinecone

obj2 = PineconeServerless()

str_list = []
for q, _, _ in qna:
    response2 = obj2.query_pinecone(q, in_filter)
    conversation1_df = obj2.query_delta_conversations()
    text_list = list(conversation1_df['text'])
    str_text = ' '.join(text_list)
    str_list.append(str_text)


res['Context'] = str_list

In [None]:
from rouge import Rouge

rouge = Rouge()
def get_rouge_score(generated_summary, reference_summary):
    scores = rouge.get_scores(generated_summary, reference_summary, avg=True)
    return scores

# for i in range(len(res)):
#     print(f"Query: {res['Query'][i]}\nAnswer: {res['Answer'][i]}\nROUGE Score: {get_rouge_score(res['Answer'][i], res['Context'][i])}\n\n")

res['ROUGE Score'] = res.apply(lambda x: get_rouge_score(x['Answer'], x['Ground Truth']), axis=1)


In [None]:
for i in range(len(res)):
    print(f"Query: {res['Ground Truth'][i]}\nAnswer: {res['Answer'][i]}\nROUGE Score: {get_rouge_score(res['Answer'][i], res['Ground Truth'][i])}\n\n")


In [None]:
# Dashboard
tru.run_dashboard()

In [None]:
'''
ROUGE-1:
r: Recall. It measures the overlap of unigrams (single words) between the generated answer and the reference answer. 
p: Precision. It measures the percentage of unigrams in the generated answer that are also present in the reference answer. 
f: F1 Score. It is the harmonic mean of precision and recall. It provides a balance between precision and recall.

ROUGE-2:
r: Recall. It measures the overlap of bigrams (sequences of two words) between the generated answer and the reference answer. 
p: Precision. It measures the percentage of bigrams in the generated answer that are also present in the reference answer.
f: F1 Score. It represents the overall performance of the ROUGE-2 metric.

ROUGE-L:
r: Recall. It measures the longest common subsequence of words between the generated answer and the reference answer. 
p: Precision. It measures the percentage of words in the generated answer that are also present in the reference answer. 
f: F1 Score. It represents the overall performance of the ROUGE-L metric.

'''

In [None]:
# RUNS THE DASHBOARD
tru.run_dashboard()