# Benchmarking with generated QnA pairs

In [2]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
import csv

load_dotenv()


# Azure OpenAI client: the API key and endpoint are read from environment
# variables rather than being hard-coded in the notebook.
client = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_version="2023-05-15",  # this may change in the future
    timeout=1200,  # seconds, i.e. 20 minutes
)

def generate_summary(question, ground_truth, answer, model="gpt-4-1106-preview"):
    """Ask the chat model to score a generated answer against its ground truth.

    Despite its name, this function does not summarise anything: it sends the
    question, the reference context and the generated answer to the chat
    model and returns the model's reply, which is requested in the form
    ``cosine similarity : x``. The score is the model's own estimate; no
    embeddings are computed here.

    Args:
        question: The question that was asked.
        ground_truth: Reference content the answer is judged against.
        answer: The generated answer to validate.
        model: Model/deployment name passed to the chat completions call.

    Returns:
        The raw text content of the first choice in the model's response.

    Non-string inputs are formatted with ``str()`` semantics instead of
    raising ``TypeError`` during prompt construction.
    """
    system_prompt = (
        "You are a validation agent, validating if the answers given to the "
        "asked questions are right.\n"
        'Return the cosine similarity in the format: "cosine similarity : x"'
    )
    # Each field goes on its own labelled line so the model can tell where the
    # question, the reference context and the generated answer start and end.
    user_prompt = (
        "To the answer given, return the cosine similarity of how accurate the "
        "generated answer is to the context of the answer.\n"
        "Do not return anything other than the cosine similarity score.\n\n"
        f"Question: {question}\n\n"
        f"Context for answer: {ground_truth}\n\n"
        f"Generated answer: {answer}"
    )

    response = client.chat.completions.create(
        temperature=0.0,  # keep the scoring as repeatable as the model allows
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    return response.choices[0].message.content




# Loading the QnA pairs
The QnA JSON file stores its pairs in a list under the top-level `QA` key. Each pair has the following fields:
1. `question`: The question that was asked
2. `answer`: The answer that was generated
3. `ground_truth`: Either a manually typed answer or relevant content copied and pasted from the website

In [3]:
import json

# Name of the QnA dataset to evaluate; change this to benchmark another file.
dataset = 'Trafilatura_Response.json'

# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default locale encoding.
with open(dataset, 'r', encoding='utf-8') as file:
    question_answer = json.load(file)

# Quick sanity check: number of pairs plus the first question/answer.
print("Number of question and answer pairs: ", len(question_answer['QA']))
print("\n")
print("Example question: ", question_answer['QA'][0]['question'])
print("\n")
print("Example Answer: ", question_answer['QA'][0]['answer'])

Number of question and answer pairs:  10


Example question:  How to make an appointment?


Example Answer:  To make an appointment, you should contact the specific facility you wish to visit. The context provided does not include a direct method for scheduling an appointment, but you can find contact information for the facilities by visiting the Tenet Healthcare Corporation website and searching for the location nearest to you. Once you have located the facility, you can use the provided address, phone number, or other contact details to reach out to them and inquire about making an appointment.


## Running the loop validating each QnA pair

In [4]:
# Ask the validation model to score every QnA pair, logging each pair as it goes.
cosine_similarity = []

for i, qa_pair in enumerate(question_answer['QA']):
    print("_______________________________________")
    print("iteration: ", i)

    question = qa_pair['question']
    print(f"Query send: {question}\n")

    ground_truth = qa_pair['ground_truth']
    # The answer is read from the dataset; no RAG call is made in this loop.
    gen_response = qa_pair['answer']
    print(f"\n Generated response: {gen_response}\n")
    print(f"\nGround truth: {ground_truth}\n")

    score_text = generate_summary(question, ground_truth, gen_response)
    cosine_similarity.append(score_text)
    print(cosine_similarity)  # running list of the raw model replies so far
    print("_______________________________________")

_______________________________________
iteration:  0
Query send: How to make an appointment?


 Generated response: To make an appointment, you should contact the specific facility you wish to visit. The context provided does not include a direct method for scheduling an appointment, but you can find contact information for the facilities by visiting the Tenet Healthcare Corporation website and searching for the location nearest to you. Once you have located the facility, you can use the provided address, phone number, or other contact details to reach out to them and inquire about making an appointment.


Ground truth:  To book an appointment, you would typically need to contact the healthcare facility directly. Look for contact information on their official website or any official correspondence you have received from them. If you have a specific department or service in mind, their direct contact details may also be available on the website. If you are a new patient, you might need

## Final Results of the selected dataset

In [7]:
# Report the raw score text returned for each QnA pair, numbered from 1.
for i, result in enumerate(cosine_similarity, start=1):
    print(f"For QnA pair {i}, the cosine similarity is: {result}")

For QnA pair 1, the cosine similarity is: cosine similarity : 0.86
For QnA pair 2, the cosine similarity is: cosine similarity : 0.95
For QnA pair 3, the cosine similarity is: cosine similarity: 0.865
For QnA pair 4, the cosine similarity is: cosine similarity: 0
For QnA pair 5, the cosine similarity is: cosine similarity : 0.759
For QnA pair 6, the cosine similarity is: cosine similarity: 0.78
For QnA pair 7, the cosine similarity is: cosine similarity: 0.0
For QnA pair 8, the cosine similarity is: cosine similarity : 0.0
For QnA pair 9, the cosine similarity is: cosine similarity: 1
For QnA pair 10, the cosine similarity is: cosine similarity: 1


## Note:
1. The accuracy seems unstable, especially for pairs where the accuracy score is below 50% (0.5).
2. While the scraped text is complete with both BeautifulSoup and Trafilatura, the format they produce is slightly different: BeautifulSoup includes a lot more whitespace while scraping.
3. BeautifulSoup may be slower and may cost more tokens, but on average it has 4 to 7% better accuracy.