# Evaluating the chatbot

The ground truth data was manually created by the team and is stored in an Excel file. The file contains the following information:
1. Question
2. Correct answer
3. The IDs of the relevant chunks that should be returned by the retriever, for each embedding type.

In [1]:
from backend.evaluation.Evaluator import Evaluator
from backend.pipeline.DBHandler import DBHandler

import pandas as pd
import transformers
from retrying import retry

# Restrict the transformers library's logging to error-level messages.
transformers.logging.set_verbosity_error()

In [2]:
@retry(stop_max_attempt_number=3, wait_fixed=60*1000)  # 3 attempts, 60 seconds between retries
def get_ground_truth_data(emb, gt_file="./FAQ.xlsx"):
	"""Load the ground-truth question/answer pairs for one embedding type.

	Args:
		emb: Embedding identifier; selects the ``relevant_chunks_id_{emb}``
			column of the workbook.
		gt_file: Path of the Excel workbook holding the ground truth.

	Returns:
		A list of ``(question, answer, relevant_chunk_ids)`` tuples.
		``relevant_chunk_ids`` is a list of ID strings, each with surrounding
		whitespace removed; empty entries (e.g. from a trailing comma) are
		dropped.
	"""
	data = pd.read_excel(gt_file)
	chunks_column = f"relevant_chunks_id_{emb}"
	QA_list = []
	for _, row in data.iterrows():
		# Rows with a missing value in any column are skipped.
		if row.isnull().values.any():
			continue
		raw_ids = row[chunks_column]
		# A cell holding a single ID may be read as a number instead of text;
		# render integer-valued floats without the trailing ".0".
		if isinstance(raw_ids, float) and raw_ids.is_integer():
			raw_ids = int(raw_ids)
		# Strip every ID individually so "1, 2" yields ["1", "2"], not ["1", " 2"].
		relevant_chunks_id = [part.strip() for part in str(raw_ids).split(",") if part.strip()]
		QA_list.append((row["question"], row["answer"], relevant_chunks_id))
	return QA_list


In [3]:
def evaluate_chatbot(ground_truth_data, style, embedding_type, search_method, llm_name):
	"""Run the evaluator for one configuration and label its results.

	A DBHandler is created for the organisation ``s_maccabi_<embedding_type>``
	and handed to an Evaluator configured with the given style, LLM and
	embedding model. The object returned by ``Evaluator.evaluate`` is then
	tagged with the configuration values under the keys ``style``,
	``embedding_type``, ``search_method`` and ``llm_name``; an empty style
	is recorded as ``'neutral'``.
	"""
	handler = DBHandler(
		org_id=f's_maccabi_{embedding_type}',
		user_id='evaluator',
		search_method=search_method,
	)
	scorer = Evaluator(
		handler,
		style=style,
		llm_model_name=llm_name,
		embedding_model_name=embedding_type,
	)
	results = scorer.evaluate(ground_truth_data)

	# Attach the configuration to the results, in a fixed key order.
	config_labels = {
		'style': 'neutral' if style == '' else style,
		'embedding_type': embedding_type,
		'search_method': search_method,
		'llm_name': llm_name,
	}
	for label, value in config_labels.items():
		results[label] = value
	return results

Evaluating the chatbot's answers against the ground truth data. Various configurations are tested:
1. 3 LLMs
2. 3 embedding types
3. 2 search methods
4. 5 styles

In [None]:
# Configuration grid: every combination of style, embedding type, LLM and
# search method is evaluated once.
styles = ['', 'kids', 'elderly', 'emoji', 'rhymes']  # empty string means no style
embedding_types = [
	'emb1',  # models/text-embedding-004
	'emb2',  # models/embedding-001
	'emb3',  # from HW1: SentenceTransformer('all-MiniLM-L6-v2')
]
llm_names = [
	'gemini-1.5-flash',
	'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
	'mistralai/Mistral-7B-Instruct-v0.1',
]
search_methods = [
	'approximate',
	'exact',
]

# Column layout of the combined results table.
RESULT_COLUMNS = ['style', 'embedding_type', 'search_method', 'llm_name', 'question', 'true_answer',
	'chatbot_answer', 'cosine_similarity', 'correctness_score', 'faithfulness_score', 'retriever_scores']

# The ground truth depends only on the embedding type, so read it once per
# embedding instead of once per (style, embedding) pair.
ground_truth_by_embedding = {
	embedding_type: get_ground_truth_data(embedding_type) for embedding_type in embedding_types
}

# Per-configuration result frames; they are concatenated among themselves so
# that no empty placeholder frame takes part in the concat.
result_frames = []
full_results = pd.DataFrame(columns=RESULT_COLUMNS)  # remains empty if every configuration fails

for style in styles:
	str_style = style if style != '' else 'neutral'
	for embedding_type in embedding_types:
		ground_truth_data = ground_truth_by_embedding[embedding_type]
		for llm_name in llm_names:
			for search_method in search_methods:
				print(f"Style: {str_style}", end=" | ")
				print(f"embedding_type: {embedding_type}", end=" | ")
				print(f"llm_name: {llm_name}", end=" | ")
				print(f"search_method: {search_method} | ", end=" Status: ")
				try:
					results = evaluate_chatbot(ground_truth_data, style, embedding_type, search_method, llm_name)
					result_frames.append(results)
					combined = pd.concat(result_frames, ignore_index=True)
					# Keep the documented column order first; any additional
					# columns produced by the evaluator follow it.
					extra_columns = [col for col in combined.columns if col not in RESULT_COLUMNS]
					full_results = combined.reindex(columns=RESULT_COLUMNS + extra_columns)
					full_results.to_csv("full_results.csv", index=False)  # re-save after each iteration to be safe :)
					print("Done")
				except Exception as e:
					# Best effort: report the failing configuration and move on to the next one.
					print("Configuration failed")
					print(f"Error content: {e}", end="\n\n")

Style: neutral | embedding_type: emb1 | llm_name: gemini-1.5-flash | search_method: approximate |  Status: 

In [None]:
# Preview up to the first 100 rows of the collected results.
full_results.head(100)