In [75]:
import pandas as pd
import dotenv
import json
import datetime
import os
import uuid
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings.openai import OpenAIEmbeddings
import time
dotenv.load_dotenv()

# PENDING : Move these to a config file
# Name of the Pinecone index every PineconeServerless instance points at.
INDEX_NAME = 'langchain-retrieval-transcript'
# Vector dimension passed to create_index.
PINECONE_VECTOR_DIMENSION = 1536
# Number of transcript rows buffered before one upsert call is issued.
PINECONE_UPSERT_BATCH_LIMIT = 90
# top_k passed to index.query.
PINECONE_TOP_K_RESULTS = 10
# Number of neighbouring ids fetched on each side of a query hit.
DELTA = 2
# Cloud / region passed to ServerlessSpec when the index is created.
CLOUD_PROVIDER = 'aws'
REGION = 'us-west-2'
# Similarity metric passed to create_index.
METRIC = 'cosine'

# Embedding backend selector and the model name handed to OpenAIEmbeddings.
EMBEDDING = 'OpenAI'
EMBEDDING_MODEL = 'text-embedding-ada-002'

# Pinecone namespace for upsert/query/fetch; also the local data sub-directory name.
NAMESPACE = 'default_namespace'
# Base name (without '.json') of the master bookkeeping file.
master_json_file = 'master_meeting_details'

class PineconeServerless:
    """Wrapper around a single Pinecone serverless index used for meeting transcripts.

    Transcript rows are embedded and upserted under sequential integer ids
    (sent to Pinecone as strings). The highest id issued so far, plus one entry
    per ingested meeting, is kept in a master JSON file under
    ``base_data_path``; each meeting also gets its own ``<meeting_uuid>.json``.
    """

    def __init__(self) -> None:
        """Read the API keys from the environment and create the Pinecone client."""
        # NOTE(review): when a variable is unset, the variable *name* is used as
        # the key, so a missing key is not detected here.
        PINECONE_API_KEY = os.getenv('PINECONE_SERVERLESS_API_KEY') or 'PINECONE_SERVERLESS_API_KEY'
        self.OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or 'OPENAI_API_KEY'
        self.index_name = INDEX_NAME
        self.meeting_title = None
        self.pinecone = Pinecone(api_key=PINECONE_API_KEY)
        self.base_data_path = os.path.join(os.getcwd(), '../../','bin/data/', NAMESPACE)
        self.master_json_file = os.path.join(self.base_data_path, master_json_file)
        # Most recent query response; consumed by query_delta_conversations().
        self.response = None

    def check_index_already_exists(self) -> bool:
        """Return True if ``self.index_name`` is among the account's indexes."""
        listing = self.pinecone.list_indexes()
        # Newer clients return an index-list object exposing .names(); fall back
        # to plain membership for clients that return the names directly.
        names = listing.names() if hasattr(listing, 'names') else listing
        return self.index_name in names

    def _get_vector_embedder(self, EMBEDDING: str = 'OpenAI'):
        """Return the embedding client for the requested backend.

        Raises:
            ValueError: if ``EMBEDDING`` is not a supported backend (previously
                this silently returned None and failed later at the call site).
        """
        if EMBEDDING == 'OpenAI':
            return OpenAIEmbeddings(
                model=EMBEDDING_MODEL,
                openai_api_key=self.OPENAI_API_KEY)
        raise ValueError(f'Unsupported embedding backend: {EMBEDDING!r}')

    def _get_index(self):
        """Return a handle on the configured index."""
        return self.pinecone.Index(self.index_name)

    def _master_json_path(self) -> str:
        """Return the full path of the master bookkeeping JSON file."""
        return os.path.join(self.base_data_path, f'{master_json_file}.json')

    def _create_index(self, INDEX_NAME: str) -> None:
        """Create a serverless index called ``INDEX_NAME`` and block until it is ready.

        Failures are reported with print() rather than raised.
        """
        try:
            self.pinecone.create_index(
                name=INDEX_NAME,
                metric=METRIC,
                dimension=PINECONE_VECTOR_DIMENSION,
                spec=ServerlessSpec(
                    cloud=CLOUD_PROVIDER,
                    region=REGION,
                ),
            )

            # Poll the index that was just created (not self.index_name, which
            # may differ from the argument).
            while not self.pinecone.describe_index(INDEX_NAME).status['ready']:
                time.sleep(5)

        except Exception as e:
            print('Index creation failed: ', e)

    def describe_index_stats(self) -> dict:
        """Return the index statistics, or an empty dict if the call fails."""
        try:
            return self._get_index().describe_index_stats()
        except Exception as e:
            print('Index does not exist: ', e)
            return {}

    def _delete_index(self, index_name: str) -> None:
        """Delete ``index_name``; failures are printed, not raised."""
        try:
            self.pinecone.delete_index(index_name)
        except Exception as e:
            print('Index does not exist: ', e)

    def _create_master_json(self) -> None:
        """Write a fresh master JSON file, overwriting any existing one."""
        data = {
                "index": INDEX_NAME,
                "namespace": NAMESPACE,
                "last_conversation_no": 0,
                "meeting_uuids" : [],
                "meetings" :[]
        }
        os.makedirs(self.base_data_path, exist_ok=True)

        meeting_details_file = self._master_json_path()
        print('meeting_details_file: ', meeting_details_file)
        with open(meeting_details_file, 'w') as f:
            json.dump(data, f, indent=4)

    def _update_master_json(self, meeting_uuid:str, meeting_title:str, last_conversation_no:int,
                               meeting_video_file:bool, time_stamp:str ) -> dict:
        """Load the master JSON and return it with the new meeting merged in.

        The file itself is not written here; the caller persists the result.
        """
        with open(self._master_json_path(), 'r') as f:
            data = json.load(f)
        print("MASTER JSON: LOADED ", data['last_conversation_no'])

        # De-duplicate while keeping insertion order (set() would reorder the
        # list differently from run to run).
        data['meeting_uuids'] = list(dict.fromkeys(data['meeting_uuids'] + [meeting_uuid]))
        data['last_conversation_no'] = last_conversation_no
        data['meetings'].append(
            {
                "meeting_uuid" : meeting_uuid,
                "meeting_title" : meeting_title,
                "meeting_date" : time_stamp,
                "meeting_video_file" : meeting_video_file,
            }
        )
        print("UPDATED MASTER JSON: ", data['last_conversation_no'] )
        return data

    def _get_meeting_members(self, transcript: pd.DataFrame) -> list[str]:
        """Return the distinct values of the transcript's 'speaker_label' column."""
        return list(transcript['speaker_label'].unique())

    def _create_new_meeting_json(self, meeting_uuid:str, meeting_title:str, last_conversation_no:int,
                                  meeting_members:list[str], meeting_video_file:bool, time_stamp:str, meeting_summary:str) -> None:
        """Write the per-meeting ``<meeting_uuid>.json`` file."""
        data = {
                "index": INDEX_NAME,
                "namespace": NAMESPACE,

                "meeting_title" : meeting_title,
                "meeting_uuid" : meeting_uuid,
                "meeting_date" : time_stamp,

                "last_conversation_no": last_conversation_no,
                "meeting_video_file": meeting_video_file,
                "meeting_members": meeting_members,
                "meeting_summary" : meeting_summary,
        }

        meeting_details_file = os.path.join(self.base_data_path,f'{meeting_uuid}.json')
        with open(meeting_details_file, 'w') as f:
            json.dump(data, f, indent=4)

    def _get_last_conversation_no(self) -> int:
        """Return the highest conversation id recorded in the master JSON."""
        with open(self._master_json_path(), 'r') as f:
            data = json.load(f)
        print('last_conversation_no fetched from master json: ', data['last_conversation_no'])
        return int(data['last_conversation_no'])

    def _set_new_meeting_json(self, meeting_uuid: str, meeting_title: str, last_conversation_no: int,
                               meeting_members: list[str], meeting_video_file: bool, meeting_summary:str) -> None:
        """Record a newly ingested meeting in both the per-meeting and master JSON files."""
        time_stamp = str(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

        self._create_new_meeting_json(meeting_uuid, meeting_title, last_conversation_no,
                                      meeting_members, meeting_video_file, time_stamp, meeting_summary)
        data = self._update_master_json(meeting_uuid, meeting_title, last_conversation_no, meeting_video_file, time_stamp)

        with open(self._master_json_path(), 'w') as f:
            json.dump(data, f, indent=4)

    def _convert_to_hr_min_sec(self, time_in_minutes) -> str:
        """Format a duration given in (fractional) minutes as 'HH:MM:SS'."""
        hours = int(time_in_minutes // 60)
        minutes = int(time_in_minutes % 60)
        # The fractional part of a minute becomes whole seconds (truncated).
        seconds = int((time_in_minutes - int(time_in_minutes)) * 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def _upsert_batch(self, index, embed, texts: list, metadatas: list, last_conversation_no: int) -> int:
        """Embed and upsert one batch; return the updated conversation counter.

        Ids continue from ``last_conversation_no + 1``. An upsert failure is
        printed and the counter still advances, so the ids of a failed batch
        are not reused.
        """
        ids = [str(n) for n in range(last_conversation_no + 1,
                                     last_conversation_no + len(texts) + 1)]
        print('ids: ', ids)
        embeds = embed.embed_documents(texts)
        try:
            # Materialise the tuples instead of handing over a one-shot zip iterator.
            index.upsert(vectors=list(zip(ids, embeds, metadatas)), namespace=NAMESPACE)
        except Exception as e:
            print('Error upserting into Pinecone: ', e)
        return last_conversation_no + len(texts)

    def pinecone_upsert(self, transcript: pd.DataFrame, meeting_uuid:str ='', 
                        meeting_video_file: bool=False, meeting_title: str = 'Unnamed', meeting_summary:str='') -> None:
        '''
        Upserts the transcript into Pinecone.

        Every row of ``transcript`` (columns 'speaker_label', 'start_time' and
        'text' are read) is embedded and upserted in batches of at most
        PINECONE_UPSERT_BATCH_LIMIT rows. Afterwards the per-meeting JSON and
        the master JSON are updated with the new highest conversation id.
        '''
        last_conversation_no = self._get_last_conversation_no()
        print('last_conversation_no: ', last_conversation_no)
        last_conversation_no = int(last_conversation_no)

        embed = self._get_vector_embedder(EMBEDDING)
        meeting_members = self._get_meeting_members(transcript)
        index = self._get_index()

        texts = []
        metadatas = []
        for _, record in transcript.iterrows():
            texts.append(record['text'])
            metadatas.append({
                'speaker': record['speaker_label'],
                'start_time': self._convert_to_hr_min_sec(record['start_time']),
                'text': record['text'],
                'meeting_uuid': meeting_uuid,
            })

            if len(texts) >= PINECONE_UPSERT_BATCH_LIMIT:
                last_conversation_no = self._upsert_batch(index, embed, texts, metadatas, last_conversation_no)
                texts = []
                metadatas = []

        # Flush the final, partially filled batch.
        if texts:
            last_conversation_no = self._upsert_batch(index, embed, texts, metadatas, last_conversation_no)

        print("Sending last_conversation_no to update main " ,last_conversation_no)
        self._set_new_meeting_json(meeting_uuid, meeting_title, last_conversation_no, meeting_members, meeting_video_file, meeting_summary)

    def _extract_id_from_response(self, response) -> list[int]:
        """Return the integer ids of the matches in a query response.

        An empty or missing response yields an empty list.
        """
        if not response:
            return []
        return [int(match['id']) for match in response['matches']]

    def query_pinecone(self, query: str, in_filter: 'list[str] | None' = None, complete_db_flag:bool = False) -> list:
        '''
        Queries Pinecone for the given query.

        Unless ``complete_db_flag`` is True, matches are restricted to vectors
        whose 'meeting_uuid' metadata is in ``in_filter``. The response is also
        stored on ``self.response``. On failure the error is printed,
        ``self.response`` is left as None and an empty list is returned.
        '''
        # Drop the previous response so a failed query cannot be mistaken for
        # a successful one by query_delta_conversations().
        self.response = None
        try:
            index = self._get_index()
            embed = self._get_vector_embedder(EMBEDDING)

            metadata_filter = None if complete_db_flag else {"meeting_uuid": {"$in": list(in_filter or [])}}

            self.response = index.query(
                vector= embed.embed_documents([query])[0],
                namespace = NAMESPACE,
                top_k = PINECONE_TOP_K_RESULTS,
                include_metadata = True,
                filter = metadata_filter,
            )
            return self.response
        except Exception as e:
            print('Error querying Pinecone: ', e)
        return []

    def query_delta_conversations(self) -> dict:
        '''
        Fetches the neighbouring conversations of every hit of the last query.

        For each matched id the ids within +/- DELTA (clamped to
        1..last_conversation_no) are fetched. Returns a dict mapping
        meeting_uuid to a DataFrame of the fetched rows; the dict is empty when
        there is no usable query response.
        '''
        ids = self._extract_id_from_response(self.response)
        if not ids:
            return {}

        last_conversation_no = self._get_last_conversation_no()
        index = self._get_index()
        conversation = {}

        for hit_id in ids:
            left = max(hit_id - DELTA, 1)
            right = min(hit_id + DELTA, last_conversation_no)
            window = [str(i) for i in range(left, right + 1)]
            try:
                conversation[hit_id] = index.fetch(ids=window, namespace=NAMESPACE)
            except Exception as e:
                print('Error fetching from Pinecone for id:', hit_id, "Error:", e)
                continue

        print('conversation length: ', len(conversation))
        return self._parse_fetch_conversations(conversation)

    def _parse_fetch_conversations(self, conversation) -> dict:
        """Flatten fetch responses into one DataFrame per meeting.

        Args:
            conversation: mapping of primary hit id -> fetch response, where
                ``response['vectors']`` maps vector id -> vector data carrying
                'id' and 'metadata' (meeting_uuid, speaker, start_time, text).

        Returns:
            dict mapping meeting_uuid to a DataFrame with columns primary_id,
            id, meeting_uuid, speaker, start_time, text, ordered by numeric id
            with duplicate ids removed.
        """
        data_rows = []
        for primary_hit_id, primary_hit_data in conversation.items():
            for vector_data in primary_hit_data['vectors'].values():
                metadata = vector_data['metadata']
                data_rows.append((
                    primary_hit_id,
                    vector_data['id'],
                    metadata['meeting_uuid'],
                    metadata['speaker'],
                    metadata['start_time'],
                    metadata['text'],
                ))

        columns = ['primary_id', 'id', 'meeting_uuid','speaker', 'start_time', 'text']
        delta_conversation_df = pd.DataFrame(data_rows, columns=columns)
        # Ids are numeric strings: sort them as integers so '10' follows '9'
        # rather than '1'. A stable sort keeps the first-seen row per id.
        delta_conversation_df = delta_conversation_df.sort_values(
            by='id', key=lambda col: col.astype(int), kind='mergesort')
        print('LENGTH delta_conversation_df: ', len(delta_conversation_df))
        delta_conversation_df = delta_conversation_df.drop_duplicates(subset=['id'])

        # One DataFrame per meeting_uuid, each re-indexed from 0.
        return {
            group_name: group.reset_index(drop=True)
            for group_name, group in delta_conversation_df.groupby('meeting_uuid')
        }


In [76]:
# Client used by the ingestion and query cells below.
obj1 = PineconeServerless()

In [None]:
#obj1._delete_index('langchain-retrieval-transcript')

In [None]:
# obj1._create_index('langchain-retrieval-transcript')

In [None]:
# obj1._create_master_json()

In [None]:
# Print the index statistics; describe_index_stats() returns {} if the lookup fails.
print(obj1.describe_index_stats())

In [None]:
# obj1._create_index('langchain-retrieval-transcript')
# # # # ## obj2._create_index(INDEX_NAME)  # No need as both obj1 and obj2 are pointing to the same index

# print(obj1.describe_index_stats())
# ## print(obj2.describe_index_stats()) # No need as both obj1 and obj2 are pointing to the same index

In [None]:
# pinecone_upsert(transcript, meeting_uuid='', meeting_video_file=False,
#                 meeting_title='Unnamed', meeting_summary='')
# NOTE: pinecone_upsert has no `meeting_cluster_id` parameter; passing it raises TypeError.

for i in range(3, 4):
    print(i)
    # Drop incomplete rows without mutating the frame returned by read_csv in place.
    transcript = pd.read_csv(f'healthcare_{i} copy.csv').dropna()
    obj1.pinecone_upsert(transcript, meeting_uuid=str(uuid.uuid4()),
                          meeting_video_file=False, meeting_title=f'Healthcare Meeting {i}',
                          meeting_summary=f'Healthcare Meeting Summary Meeting {i}')
    time.sleep(5)

In [None]:
# Re-check the index statistics after the upsert.
obj1.describe_index_stats()

In [None]:
# to be received via clustering model
# NOTE(review): query_pinecone matches this list against the 'meeting_uuid'
# metadata field, while this value looks like a cluster label - confirm.
# The query cell below passes complete_db_flag=True, so the filter is not applied there.
in_filter = ['healthcare_2']

In [77]:
# Free-text question embedded and searched for in the index.
#query = "how many patients came to the hospital campuses today?"
query = "I am one of the directors in Wappingers Central School District."

In [78]:
#def query_pinecone(self, query: str, in_filter: list[str]=[], complete_db_flag:bool = False)

# Third argument True = complete_db_flag: search the whole namespace and ignore in_filter.
response1 = obj1.query_pinecone(query, in_filter, True)
print(response1)
print('*'*25)
# response2 = obj2.query_pinecone('What was discussed about Atlassian ?', namespace2)
# print(response2)

{'matches': [{'id': '1',
              'metadata': {'meeting_cluster_id': 'healthcare_1',
                           'meeting_uuid': '9da09451-0efc-4473-a9e3-78a155da8e26',
                           'speaker': 'spk_0',
                           'start_time': '00:00:00',
                           'text': 'I am one of the directors in Wappingers '
                                   'Central School District. I love seeing so '
                                   'many cameras on. So as Im talking, I see '
                                   'um its, its like lights flickering and '
                                   'more faces are appearing and um so excited '
                                   'to have so many of our Wappingers Central '
                                   'School District students joining us today. '
                                   'So I am gonna be hopping back and forth '
                                   'between our screens, but I did want to um '
             

In [79]:
# Integer ids of the matches in the response above.
print(obj1._extract_id_from_response(response1))
# print(obj2._extract_id_from_response(response2))

[1, 150, 55, 97, 121, 95, 2, 78, 146, 98]


In [80]:
# Despite the `_df` suffix this is a dict of DataFrames keyed by meeting_uuid:
# the neighbours (+/- DELTA ids) of every hit of the last query, grouped per meeting.
conversation1_df = obj1.query_delta_conversations()

last_conversation_no fetched from master json:  154
conversation length:  10
LENGTH delta_conversation_df:  47


In [85]:
# conversation1_df

# drop index from conversation1_df
# conversation1_df.reset_index(drop=True, inplace=True)
# conversation1_df
# The groups are keyed by meeting_uuid (see _parse_fetch_conversations),
# so label them as such rather than as cluster ids.
data = []
for meeting_uuid, meeting_df in conversation1_df.items():
    print(f"DataFrame for meeting_uuid '{meeting_uuid}':")
    data.append(meeting_df)
    print("\n")

DataFrame for meeting_cluster_id 'healthcare_1':


DataFrame for meeting_cluster_id 'healthcare_2':


DataFrame for meeting_cluster_id 'healthcare_3':




In [90]:
# Display the first per-meeting DataFrame collected above.
data[0]

Unnamed: 0,primary_id,id,meeting_uuid,meeting_cluster_id,speaker,start_time,text
0,1,1,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_0,00:00:00,I am one of the directors in Wappingers Centra...
1,2,2,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_1,00:07:19,"Sorry, I put my face off screen and then I cou..."
2,2,3,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_2,00:10:39,"If youre not sure what this is about,"
3,2,4,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_1,00:10:41,thats OK too because well explain more as we ...
4,55,53,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_3,00:31:10,"Well, if the,\n if the school has any connect..."
5,55,54,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_5,00:31:18,"uh facilities, that might be the best way then..."
6,55,55,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_3,00:31:37,"right? And I think like, um I can speak from t..."
7,55,56,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_6,00:33:08,Um Im here to talk a little bit about um what ...
8,55,57,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_3,00:39:30,Absolutely. We did have a few questions\n com...
9,78,76,9da09451-0efc-4473-a9e3-78a155da8e26,healthcare_1,spk_4,00:43:17,OK.


In [None]:
#Evaluating LLM model

In [None]:
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
# TruLens session object; used below for the leaderboard and the dashboard.
tru = Tru()

In [None]:
# Candidate system prompts; exactly one is active at a time.
# NOTE(review): in this notebook `prompt` is only passed to the embeddings call
# in the next cell; RAG_from_scratch.generate_completion does not use it.
#prompt = "You are a chatbot built to answer questions about the meeting. You will receive relevant meeting transcript. You should understand the transcript and answer the user query. You can ask questions to the user and answer their questions based on the context provided."
#prompt = "You are a chatbot built to answer questions about the meeting. You will refrain answering questions for which the context is not provided. You will answer the questions solely based on the context provided. You can ask questions to the user and answer their questions based on the context provided."
prompt  = "You are a friendly chat model. You will respond to the user's message using only the context provided. The context can be anything from a single message to a whole conversation. You can also ask questions to the user."
#prompt = "You are a friendly chatbot built to answer questions about the meeting. You will be penalized if you answer questions for which the context is not provided. You will answer the questions solely based on the context provided. You can ask questions to the user and answer their questions based on the context provided."
#prompt = "You are provided the meeting transcript file. The data includes meeting speaker, time of the conversation and the conversation text. Your job is to answer user questions based on the context provided. You can ask questions to the user and answer their questions based on the context provided."
#prompt = "You are a professional chat bot. You will answer the user's questions based on the context provided. You can ask questions to the user and answer their questions based on the context provided. You will refrain from answering questions for which the context is not provided. If the context is not clear, you will ask the user for more information. You will be penalized if you answer questions for which the context is not provided."

In [None]:
from openai import OpenAI
# OpenAI client shared with RAG_from_scratch.generate_completion below.
oai_client = OpenAI()

# Embed the prompt once; the result is not stored, only shown as the cell output.
oai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=prompt,
    )

In [None]:
class RAG_from_scratch:
    """Minimal retrieve-then-generate pipeline instrumented for TruLens."""

    @instrument
    def retrieve(self, query: str, in_filter: list) -> list:
        """
        Retrieve relevant text from vector store.

        Returns the 'text' values of every conversation row fetched around the
        query hits, across all meetings, as a flat list of strings.
        """
        obj1 = PineconeServerless()
        obj1.query_pinecone(query, in_filter)
        grouped = obj1.query_delta_conversations()
        # query_delta_conversations() returns {meeting_uuid: DataFrame}; indexing
        # that dict with 'text' raises KeyError, so flatten the groups instead.
        # A bare DataFrame is still accepted.
        frames = list(grouped.values()) if isinstance(grouped, dict) else [grouped]
        return [text for frame in frames for text in frame['text']]

    @instrument
    def generate_completion(self, query: str, context_str) -> str:
        """
        Generate answer from context.

        ``context_str`` may be a string or an iterable of text chunks; chunks
        are joined with newlines so the prompt holds plain text rather than a
        Python list repr.
        """
        if not isinstance(context_str, str):
            context_str = "\n".join(str(chunk) for chunk in context_str)

        completion = oai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=
        [
            {"role": "user",
            "content": 
            f"We have provided context information below. \n"
            f"---------------------\n"
            f"{context_str}"
            f"\n---------------------\n"
            f"Given this information, please answer the question: {query}"
            }
        ]
        ).choices[0].message.content
        return completion

    @instrument
    def query(self, query: str, in_filter:list) -> str:
        """Retrieve context for ``query`` and return the generated answer."""
        context_str = self.retrieve(query, in_filter)
        completion = self.generate_completion(query, context_str)
        return completion

rag = RAG_from_scratch()

In [None]:
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

import numpy as np

# Initialize provider class
fopenai = fOpenAI()

grounded = Groundedness(groundedness_provider=fopenai)

# Define a groundedness feedback function
# Inputs: the collected return values of `retrieve` and the app's final output.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
# Inputs: the `query` argument of `retrieve` and the app's final output.
f_qa_relevance = (
    Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on_output()
)

# Question/statement relevance between question and each context chunk.
# Inputs: the `query` argument and the collected return values of `retrieve`;
# the per-chunk scores are averaged with np.mean.
f_context_relevance = (
    Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets.collect())
    .aggregate(np.mean)
)

In [None]:
from trulens_eval import TruCustomApp
# Wrap the RAG object so calls made inside `with tru_rag` are recorded under
# app id 'RAG v1' with the three feedback functions attached.
tru_rag = TruCustomApp(rag,
    app_id = 'RAG v1',
    feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance])

In [None]:
# to be received via clustering model
# meeting_uuid values the evaluation queries are restricted to.
in_filter = ['a4454589-6c49-4d8d-a6f6-c2b5c99ef229', '7c3c7ac7-94ee-4045-b94a-e9b1695c5b7e']

In [None]:
# Evaluation questions; the two off-topic ones (shark, France) should be refused.
queries = ["how many patients came to the hospital campuses today?",
           "How many patients are in the ICU?",
           "What is the recommended waiting period for COVID-19 vaccination after receiving monoclonal antibodies treatment for individuals who have previously contracted COVID-19?",
           "How much of hospital staff is expected to be lost?",
           "When will children between 5-11 years old be eligible for COVID-19 vaccination?",
           "Do you have high risk if you live in King County?",

           "How heavy is a shark?",
           "What is the capital of France?",

           "When would people not have to wear masks in schools?",
           "Who recommends universal masking?",
           ]

# Reference answers, index-aligned with `queries`.
ground_truths = ["The hospital has 54 patients across the four campuses today.",
                 "There are 23 patients in the ICU.",
                 "The recommended waiting period for COVID-19 vaccination after receiving monoclonal antibodies treatment for individuals who have previously contracted COVID 19 is 90 days.",
                 "5 percent of hospital staff is expected to be lost.",
                 "Children between 5-11 years old will be eligible for COVID-19 vaccination starting from November 1st.",
                 "Yes, if you live in King County, you have nine folds greater likelihood of getting infected.",

                 "The provided context information does not contain any relevant information about the weight of a shark.",
                 # Was the truncated "... about Pais"; now phrased like the shark entry above.
                 "The provided context information does not contain any relevant information about the capital of France.",

                 "People would not have to wear masks in schools if the school have met the 80% vaccination criteria.",
                 "The American Academy of Pediatrics and the Centers for Disease Control recommend universal masking.",
                 ]

# The evaluation loop pairs the two lists by position, so they must stay aligned.
assert len(queries) == len(ground_truths), "queries and ground_truths must be the same length"

# Filled by the evaluation loop with [query, answer, ground_truth] rows.
qna = []

In [None]:
# Pair each query with its ground truth positionally; looking the position up
# with queries.index(query) is quadratic and picks the wrong truth for
# duplicated queries.
for query, ground_truth in zip(queries, ground_truths):
    with tru_rag as recording:
        completion = rag.query(query, in_filter)
    qna.append([query, completion, ground_truth])

In [None]:
# Aggregated feedback scores for the recorded 'RAG v1' runs.
tru.get_leaderboard(app_ids=["RAG v1"])

In [None]:
# https://www.trulens.org/

In [None]:
'''
Groundedness:  0 to 1
Measure of how well the answer is supported by the context.

Answer Relevance:  0 to 1
Measure of how well the answer is relevant to the question.

Context Relevance:  0 to 1
Measure of how well the context fetched from DB is relevant to the question.

'''

In [None]:
# Dump every (query, answer, ground truth) triple collected during evaluation.
for question, answer, truth in qna:
    header = f"Query: {question}\nAnswer: {answer}\n"
    footer = f"Ground Truth: {truth}\n\n"
    print(header, footer)

In [None]:
# Automatic instructions: fine-tuning instructions using reinforcement learning

# OBSERVATIONS
'''
Query: how many patients came to the hospital campuses today?
--> Correct answer

Query: How many patients are in the ICU?
--> Correct answer

Query: What is the recommended waiting period for COVID-19 vaccination after receiving monoclonal antibodies treatment for individuals who have previously contracted COVID-19?
--> Correct answer

Query: How much of hospital staff is expected to be lost?
--> Correct answer

Query: When will children between 5-11 years old be eligible for COVID-19 vaccination?
--> Correct answer

Query: Do you have high risk if you live in King County?
--> Incorrect answer --> answer is yes

Query: How heavy is a shark?
--> SHOULD NOT BE ANSWERED

Query: What is the capital of France?
--> SHOULD NOT BE ANSWERED

Query: When would people not have to wear masks in schools?
--> Correct answer

Query: Who recommends universal masking?
--> Correct answer

'''

In [None]:
# One row per evaluated query; metric columns are added in the cells below.
res = pd.DataFrame(qna, columns = ['Query', 'Answer', 'Ground Truth'])

In [None]:
# Evaluating questions and answers by finding the cosine similarity between the question and the answer

from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used by get_similarity_score below.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_similarity_score(query, answer):
    """Return the cosine similarity of the sentence embeddings of two texts as a float."""
    first_vec, second_vec = (
        model.encode(text, convert_to_tensor=True) for text in (query, answer)
    )
    return util.pytorch_cos_sim(first_vec, second_vec).item()

# for q,a in qna:
#     print(f"Query: {q}\nAnswer: {a}\nSimilarity Score: {get_similarity_score(q,a)}\n\n")
    
# Row-wise similarity of the answer to the question and to the ground truth.
res['Query_Answer Similarity Score'] = res.apply(
    lambda row: get_similarity_score(row['Query'], row['Answer']),
    axis=1,
)
res['Answer_Ground Truth Similarity Score'] = res.apply(
    lambda row: get_similarity_score(row['Ground Truth'], row['Answer']),
    axis=1,
)


In [None]:
# Results table including the two similarity-score columns.
res

In [None]:
# Getting the context from the conversation that was received from Pinecone

obj2 = PineconeServerless()

str_list = []
for q, _, _ in qna:
    obj2.query_pinecone(q, in_filter)
    grouped = obj2.query_delta_conversations()
    # query_delta_conversations() returns {meeting_uuid: DataFrame}; indexing the
    # dict with 'text' raises KeyError, so gather the text of every group.
    frames = list(grouped.values()) if isinstance(grouped, dict) else [grouped]
    str_list.append(' '.join(str(text) for frame in frames for text in frame['text']))


res['Context'] = str_list

In [None]:
# ROUGE SCORE

In [None]:
from rouge import Rouge

# Scorer instance shared by get_rouge_score below.
rouge = Rouge()

def get_rouge_score(generated_summary, reference_summary):
    """Return the averaged ROUGE scores of a generated text against a reference."""
    return rouge.get_scores(generated_summary, reference_summary, avg=True)

# for i in range(len(res)):
#     print(f"Query: {res['Query'][i]}\nAnswer: {res['Answer'][i]}\nROUGE Score: {get_rouge_score(res['Answer'][i], res['Context'][i])}\n\n")

# ROUGE of each generated answer against its ground truth.
res['ROUGE Score'] = res.apply(
    lambda row: get_rouge_score(row['Answer'], row['Ground Truth']),
    axis=1,
)


In [None]:
# The first field is the ground truth, so label it as such (it was printed
# under "Query"). Iterating the columns directly also avoids label-based
# res[col][i] lookups, which depend on the frame's index.
for truth, answer in zip(res['Ground Truth'], res['Answer']):
    print(f"Ground Truth: {truth}\nAnswer: {answer}\nROUGE Score: {get_rouge_score(answer, truth)}\n\n")


In [None]:
# Results table including the Context and ROUGE Score columns.
res

In [None]:
'''
ROUGE-1:
r: Recall. It measures the overlap of unigrams (single words) between the generated answer and the reference answer. 
p: Precision. It measures the percentage of unigrams in the generated answer that are also present in the reference answer. 
f: F1 Score. It is the harmonic mean of precision and recall. It provides a balance between precision and recall.

ROUGE-2:
r: Recall. It measures the overlap of bigrams (sequences of two words) between the generated answer and the reference answer. 
p: Precision. It measures the percentage of bigrams in the generated answer that are also present in the reference answer.
f: F1 Score. It represents the overall performance of the ROUGE-2 metric.

ROUGE-L:
r: Recall. It is the length of the longest common subsequence (LCS) of words between the generated answer and the reference answer, divided by the number of words in the reference answer. 
p: Precision. It is the length of that same LCS divided by the number of words in the generated answer. 
f: F1 Score. It represents the overall performance of the ROUGE-L metric.

'''

In [None]:
# Dashboard
# Launch the TruLens dashboard to browse the recorded runs and feedback results.
tru.run_dashboard()