Gemini LLM evaluation - Model is gemini-pro

In [None]:
# TruLens evaluation primitives used throughout the notebook.
from trulens_eval import TruChain, Feedback, Tru, feedback, Select

# `instrument` decorates the RAG methods defined below.
from trulens_eval.tru_custom_app import instrument
# TruLens handle; queried near the end of the notebook via
# `tru.get_records_and_feedback(...)`.
tru = Tru()
import dotenv
# Load variables from a local .env file into the process environment
# (GOOGLE_API_KEY is read from the environment further down).
dotenv.load_dotenv()
# Project-local wrapper around the Pinecone vector store.
from pinecone_code import PineconeServerless
import pandas as pd
# Used by `calculate_cosine_similarity` at the end of the notebook.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
# System-style instruction text for the chat model.
# NOTE(review): no code visible in this notebook reads `prompt`; the text
# actually sent to Gemini is built inside `generate_completion` — confirm
# whether this is meant to be prepended there.
prompt = (
    "You are a friendly chat model. "
    "You will respond to the user's message using only the context provided. "
    "The context can be anything from a single message to a whole conversation. "
    "You can also ask questions to the user."
)

In [None]:
import os

import google.generativeai as genai

# Hand the API key from the environment to the Gemini client
# (the value is None when GOOGLE_API_KEY is not set).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Module-level model handle used by `RAG_from_scratch.generate_completion`.
model = genai.GenerativeModel('gemini-pro')

In [None]:
def parse_conversations(conversations) -> str:
    """Render grouped conversation rows as one newline-joined transcript.

    Args:
        conversations: Mapping whose values expose ``iterrows()`` (e.g. pandas
            DataFrames) with ``start_time``, ``speaker`` and ``text`` fields
            on every row. The keys are ignored.

    Returns:
        One ``"<start_time> - <speaker>: <text>"`` line per row; each group is
        followed by a ``"\\n\\n"`` entry, and all entries are joined with
        ``"\\n"``. An empty mapping yields an empty string.
    """
    lines = []
    for _group_key, frame in conversations.items():
        lines.extend(
            f"{row['start_time']} - {row['speaker']}: {row['text']}"
            for _, row in frame.iterrows()
        )
        # Separator entry so consecutive groups are visually split apart.
        lines.append("\n\n")
    return "\n".join(lines)

In [None]:
class RAG_from_scratch:
    """Minimal retrieve-then-generate pipeline with TruLens-instrumented steps.

    Each public method is wrapped with ``@instrument`` so the feedback
    functions defined later in the notebook can select its arguments and
    return values. Generation uses the module-level Gemini ``model``.
    """

    @instrument
    def retrieve(self, query: str, in_filter: list) -> str:
        """
        Retrieve relevant text from vector store.

        Args:
            query: The user question forwarded to ``query_pinecone``.
            in_filter: Filter values forwarded to ``query_pinecone``; the
                notebook passes an empty list when clustering is not used.

        Returns:
            The conversations from ``query_delta_conversations`` rendered to a
            single string by ``parse_conversations``.
        """
        store = PineconeServerless()
        # The query result itself is not needed here; the call is kept because
        # query_delta_conversations() is invoked on the same object right
        # after it (presumably it relies on state set by the query — confirm
        # in pinecone_code).
        store.query_pinecone(query, in_filter)
        conversations = store.query_delta_conversations()
        return parse_conversations(conversations)

    @instrument
    def generate_completion(self, query: str, context_str: str) -> str:
        """
        Generate answer from context.

        Args:
            query: The user question.
            context_str: Context text interpolated into the prompt ahead of
                the question.

        Returns:
            The ``text`` attribute of the Gemini response.
        """
        safety_settings = {
            'HARASSMENT': 'block_none',
            'HARM_CATEGORY_HARASSMENT': 'block_none',
            'HARM_CATEGORY_HATE_SPEECH': 'block_none',
            'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'block_none',
            'HARM_CATEGORY_DANGEROUS_CONTENT': 'block_none',
        }
        completion = model.generate_content(
            f"[We have provided context information below.{context_str}Given this information, please answer the question: {query}]",
            safety_settings=safety_settings,
        )
        # Surface any prompt-level feedback reported by the API for debugging.
        print(completion.prompt_feedback)
        # NOTE(review): accessing `.text` may raise if the response carries no
        # candidate — confirm against the google-generativeai documentation.
        return completion.text

    @instrument
    def query(self, query: str, in_filter: list) -> tuple:
        """
        Run retrieval followed by generation for one question.

        Args:
            query: The user question.
            in_filter: Filter values passed through to ``retrieve``.

        Returns:
            A ``(completion, context_str)`` tuple: the generated answer and
            the context string it was generated from.
        """
        context_str = self.retrieve(query, in_filter)
        print("CONTEXT ", context_str)
        completion = self.generate_completion(query, context_str)
        return completion, context_str


rag = RAG_from_scratch()

In [None]:
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

import numpy as np
# Feedback provider: the evaluations are scored through TruLens' OpenAI
# provider, while the answers themselves are generated by Gemini.
# (presumably requires OpenAI credentials in the environment — confirm)
fopenai = fOpenAI()

# Groundedness scorer backed by the provider above.
grounded = Groundedness(groundedness_provider=fopenai)

# Groundedness: compares the values returned by `retrieve` with the app output.
# NOTE(review): `query` returns a (completion, context) tuple — confirm what
# `.on_output()` resolves to for this app.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Answer relevance: the `query` argument passed to `retrieve` versus the app output.
f_qa_relevance = (
    Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on_output()
)

# Context relevance: the `query` argument passed to `retrieve` versus the values
# `retrieve` returned, with the individual scores averaged by np.mean.
f_context_relevance = (
    Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets.collect())
    .aggregate(np.mean)
)

In [None]:
from trulens_eval import TruCustomApp

# Register the RAG instance with TruLens under the app id 'RAG gemini' and
# attach the three feedback functions defined above.
tru_rag = TruCustomApp(
    rag,
    app_id='RAG gemini',
    feedbacks=[f_groundedness, f_qa_relevance, f_context_relevance],
)

In [None]:
# To test with clustering, swap the empty list below for this filter
# (the values are presumably cluster ids — confirm against pinecone_code):
#in_filter = ['a4454589-6c49-4d8d-a6f6-c2b5c99ef229', '7c3c7ac7-94ee-4045-b94a-e9b1695c5b7e']
in_filter = []  # empty filter: clustering is not used

In [None]:

import pandas as pd

# Evaluation set: one question and its reference answer per row.
questions = pd.read_csv("Questions.csv")
queries, ground_truths = questions['Question'], questions['Groundtruth']

# Collects one [query, answer, ground truth, context] row per question.
qna = []

In [None]:
# Convert the question and ground-truth columns to plain Python lists.
# Reading from `questions` instead of re-binding the names onto themselves
# keeps this cell safe to re-run: calling `.to_list()` on an already
# converted list would raise AttributeError.
queries = questions['Question'].to_list()
ground_truths = questions['Groundtruth'].to_list()

In [None]:
# Run every question through the instrumented RAG app and keep the results.
# Questions and ground truths are paired positionally with zip(): looking the
# ground truth up via `queries.index(query)` is quadratic and returns the
# first match, which is wrong when the same question appears more than once.
for query, ground_truth in zip(queries, ground_truths):
    # Calls made inside the context manager are recorded by TruLens.
    with tru_rag as recording:
        completion, context = rag.query(query, in_filter)
        print(completion)
        # Row order must match the columns used when exporting `qna`:
        # query, answer, ground truth, context.
        qna.append([query, completion, ground_truth, context])

In [None]:
# Rows are stored as [query, answer, ground truth, context]; unpack in that
# order so the "Context" and "Ground Truth" labels show the right values.
for q, a, gt, con in qna:
    print(f"Query: {q}\nAnswer: {a}\nContext: {con}", f"\nGround Truth: {gt}\n\n")

In [None]:
import pandas as pd

# Persist the collected rows; the column order mirrors the order in which
# each row was appended to `qna`.
res = pd.DataFrame(
    qna,
    columns=['Query', 'Answer', 'Ground Truth', 'Context'],
)
res.to_csv('rag_results_gemini.csv', index=False)

In [None]:
# Fetch what TruLens stored for the 'RAG gemini' app id and keep element 0 of
# the returned value (presumably the per-record DataFrame — confirm against
# the TruLens documentation).
results_df=tru.get_records_and_feedback(app_ids=["RAG gemini"])[0]

In [1]:
# Reload the question/answer/ground-truth rows written to
# 'rag_results_gemini.csv' earlier in this notebook.
df=pd.read_csv("rag_results_gemini.csv")


def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity of two texts' bag-of-words count vectors.

    Args:
        text1: First text. ``None`` and float NaN (how pandas reads back an
            empty CSV cell) are treated as empty text; other non-string
            values are converted with ``str()``.
        text2: Second text, handled the same way.

    Returns:
        The cosine similarity of the two ``CountVectorizer`` count vectors,
        or ``0.0`` when either text is empty/whitespace or when neither text
        yields any token.
    """
    def _as_text(value):
        # `value != value` is only true for NaN.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    first, second = _as_text(text1), _as_text(text2)
    if not first.strip() or not second.strip():
        # Nothing to compare: report no similarity instead of failing.
        return 0.0

    try:
        counts = CountVectorizer().fit_transform([first, second])
    except ValueError:
        # Raised for an empty vocabulary, e.g. when no text contains a token.
        return 0.0
    vectors = counts.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Score each generated answer against its ground truth.
# NOTE(review): the scores are computed from `df` but assigned onto
# `results_df`, so rows are matched by index label. This assumes both frames
# hold the same records in the same order — confirm, especially if TruLens
# holds records from earlier runs under this app id.
results_df['cosine_similarity']= df.apply(lambda row: calculate_cosine_similarity(row['Answer'], row['Ground Truth']), axis=1)

# Write the TruLens records plus the similarity column to disk.
results_df.to_csv("gemini_pro_results.csv")