In [None]:
import os
import numpy as np
import pandas as pd

from dotenv import load_dotenv
load_dotenv('../../LegalBot/Database'+'/.env')

from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.metrics import ContextualPrecisionMetric, ContextualRecallMetric

import sys
sys.path.append('../../')

from LegalBot.RAG_v1 import RAG_Bot 

In [None]:
# Single threshold shared by every deepeval metric in this notebook, so it can
# be tuned in one place. NOTE(review): per the deepeval docs the threshold only
# decides pass/fail; the helper functions in this notebook read just `.score`
# and `.reason`.
METRIC_THRESHOLD = 0.5

# Generation metrics: judge the bot's answer itself.
faithfulness_metric = FaithfulnessMetric(threshold=METRIC_THRESHOLD)
answer_relevancy_metric = AnswerRelevancyMetric(threshold=METRIC_THRESHOLD)

# Retrieval metrics: judge the retrieved context passages.
context_precision_metric = ContextualPrecisionMetric(threshold=METRIC_THRESHOLD)
context_recall_metric = ContextualRecallMetric(threshold=METRIC_THRESHOLD)

# Defining Functions for Evaluations

# Generation Evaluation

In [None]:
def test_answer_relevancy(query, actual_answer, LLM_Response, retrieved_contexts):
    """Run the answer-relevancy metric on one query/response pair.

    Builds a deepeval ``LLMTestCase`` and measures it with the module-level
    ``answer_relevancy_metric``.

    Args:
        query: The question that was asked; converted to ``str``.
        actual_answer: Unused here; kept in the signature, which matches the
            other evaluation helpers in this notebook.
        LLM_Response: The bot's generated answer; converted to ``str``.
        retrieved_contexts: Context passages, forwarded unchanged as
            ``retrieval_context``.

    Returns:
        tuple: ``(score, reason)`` read from the metric after ``measure``.
    """
    test_case = LLMTestCase(
        # Pass the query verbatim (no extra punctuation appended) so the
        # judge model sees exactly the question that was asked.
        input=str(query),
        actual_output=str(LLM_Response),
        retrieval_context=retrieved_contexts,
    )
    answer_relevancy_metric.measure(test_case)
    return answer_relevancy_metric.score, answer_relevancy_metric.reason

def test_faithfulness(query, actual_answer, LLM_Response, retrieved_contexts):
    """Run the faithfulness metric on one query/response pair.

    Builds a deepeval ``LLMTestCase`` and measures it with the module-level
    ``faithfulness_metric``.

    Args:
        query: The question that was asked; converted to ``str``.
        actual_answer: Unused here; kept in the signature, which matches the
            other evaluation helpers in this notebook.
        LLM_Response: The bot's generated answer; converted to ``str``.
        retrieved_contexts: Context passages, forwarded unchanged as
            ``retrieval_context``.

    Returns:
        tuple: ``(score, reason)`` read from the metric after ``measure``.
    """
    test_case = LLMTestCase(
        # Pass the query verbatim (no extra punctuation appended) so the
        # judge model sees exactly the question that was asked.
        input=str(query),
        actual_output=str(LLM_Response),
        retrieval_context=retrieved_contexts,
    )
    faithfulness_metric.measure(test_case)
    return faithfulness_metric.score, faithfulness_metric.reason

# Retrieval Evaluation

In [None]:
def test_context_precision(query, actual_answer, LLM_Response, retrieved_contexts):
    """Run the contextual-precision metric on one benchmark row.

    Builds a deepeval ``LLMTestCase`` and measures it with the module-level
    ``context_precision_metric``.

    Args:
        query: The question that was asked; converted to ``str``.
        actual_answer: Reference answer, supplied to the test case as
            ``expected_output``; converted to ``str``.
        LLM_Response: The bot's generated answer; converted to ``str``.
        retrieved_contexts: Context passages, forwarded unchanged as
            ``retrieval_context``.

    Returns:
        tuple: ``(score, reason)`` read from the metric after ``measure``.
    """
    test_case = LLMTestCase(
        # Pass the query verbatim (no extra punctuation appended) so the
        # judge model sees exactly the question that was asked.
        input=str(query),
        actual_output=str(LLM_Response),
        expected_output=str(actual_answer),
        retrieval_context=retrieved_contexts,
    )
    context_precision_metric.measure(test_case)
    return context_precision_metric.score, context_precision_metric.reason

def test_context_recall(query, actual_answer, LLM_Response, retrieved_contexts):
    """Run the contextual-recall metric on one benchmark row.

    Builds a deepeval ``LLMTestCase`` and measures it with the module-level
    ``context_recall_metric``.

    Args:
        query: The question that was asked; converted to ``str``.
        actual_answer: Reference answer, supplied to the test case as
            ``expected_output``; converted to ``str``.
        LLM_Response: The bot's generated answer; converted to ``str``.
        retrieved_contexts: Context passages, forwarded unchanged as
            ``retrieval_context``.

    Returns:
        tuple: ``(score, reason)`` read from the metric after ``measure``.
    """
    test_case = LLMTestCase(
        # Pass the query verbatim (no extra punctuation appended) so the
        # judge model sees exactly the question that was asked.
        input=str(query),
        actual_output=str(LLM_Response),
        expected_output=str(actual_answer),
        retrieval_context=retrieved_contexts,
    )
    context_recall_metric.measure(test_case)
    return context_recall_metric.score, context_recall_metric.reason

# Making an Instance of the RAG chatbot to get responses

In [None]:
# Collection names handed to the bot.
# NOTE(review): presumably one collection per jurisdiction; the spellings must
# match whatever RAG_Bot looks up, so they are left exactly as written.
collection_names = ['Uk', 'Wales', 'NothernIreland', 'Scotland']

# One shared bot instance for the whole notebook.
bot = RAG_Bot(
    collection_names=collection_names,
    text_splitter='SpaCy',
    embedding_model="SentenceTransformers",
)

# Evaluating the benchmark data on a combination of parameters

In [None]:
# params_grid = {
#     'k' : [5,7,9,11,13,15,17,19,21,23,25],
#     'search_type' : ['Hybrid', 'Vector'],
#     'multi_query' : [True, False],
#     'rerank' : [True, False],
# }

# for idx_k, k in enumerate(params_grid['k']):
#     for idx_search_type, search_type in enumerate(params_grid['search_type']):
#         for idx_multi_query, multi_query in enumerate(params_grid['multi_query']):
#             for idx_rerank, rerank in enumerate(params_grid['rerank']):
#                 file_name = f'./Responses/RAG_Bot_Responses search type {search_type} rerank {rerank} multi query {multi_query} k {k}.csv'
#                 os.makedirs('./Responses', exist_ok=True)
#                 benchmark_data_df = pd.read_csv('../Evaluation_Dataset/Benchmark-Data.csv')

#                 print(f'Current Parameters Grid: search_type: {search_type}, rerank: {rerank}, multi_query: {multi_query}, k: {k}')

#                 results = []
#                 for idx_row, row in benchmark_data_df.iterrows():
#                     country = row['Country']
#                     prompt = row['Question']
#                     actual_answer = row['Actual Answer']

#                     (response, individual_context_texts) = bot.query(
#                         query = country+prompt,
#                         k = k,
#                         search_type = search_type,
#                         multi_query = multi_query,
#                         rerank = rerank,
#                         verbose = False,
#                         mode = 'eval'
#                     )

#                     # Create a dictionary to store the results of this iteration
#                     result = {
#                         'Country': country,
#                         'Prompt': prompt,
#                         'Actual Answer': actual_answer,
#                         'Response': response,
#                         'k': k,
#                         'search_type': search_type,
#                         'multi_query': multi_query,
#                         'rerank': rerank
#                     }

#                     # Add individual context texts as separate columns
#                     for idx_context, context in enumerate(individual_context_texts):
#                         result[f'Context_{idx_context+1}'] = context
#                     results.append(result)

#                 results_df = pd.DataFrame(results)
#                 results_df.to_csv(file_name, index=False)

# Deep Eval

## Answer Relevancy, Faithfulness, Context Precision && Context Recall

In [None]:
# Score every saved response file with all four deepeval metrics and write one
# score CSV per response CSV (same file name) into the scores directory.
RESPONSES_DIR = './Responses'
SCORES_DIR = './Scores'

# (column prefix, evaluation helper) pairs, listed in output-column order.
# The 'Relavancy' spelling is kept on purpose so the CSV headers stay the same
# as in score files written by earlier runs.
METRIC_TESTS = [
    ('Answer Relavancy', test_answer_relevancy),
    ('Faithfulness', test_faithfulness),
    ('Context Precision', test_context_precision),
    ('Context Recall', test_context_recall),
]
SCORE_COLUMNS = [
    f'{label} {field}'
    for label, _ in METRIC_TESTS
    for field in ('Score', 'Reason')
]

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before the (expensive) evaluation starts.
os.makedirs(SCORES_DIR, exist_ok=True)

# os.listdir returns every entry in arbitrary order; keep only CSV files
# (skipping things like checkpoint folders) and sort for a reproducible run.
response_files = sorted(
    name for name in os.listdir(RESPONSES_DIR) if name.lower().endswith('.csv')
)

for response_file in response_files:
    Scores_DF_Path = os.path.join(SCORES_DIR, response_file)
    data = pd.read_csv(os.path.join(RESPONSES_DIR, response_file))

    score_records = []
    for _, raw_row in data.iterrows():
        # dropna removes empty cells, so only the Context_N columns that are
        # actually populated for this row are passed on as retrieved context.
        row = raw_row.dropna()
        contexts = row.filter(like='Context_').values.tolist()

        # One record per benchmark row: a Score and a Reason for each metric.
        record = {}
        for label, metric_test in METRIC_TESTS:
            score, reason = metric_test(
                row['Prompt'], row['Actual Answer'], row['Response'], contexts
            )
            record[f'{label} Score'] = score
            record[f'{label} Reason'] = reason
        score_records.append(record)

    # Explicit columns keep the header row even when a response file is empty.
    Score_DF = pd.DataFrame(score_records, columns=SCORE_COLUMNS)
    Score_DF.to_csv(Scores_DF_Path, index=False)

In [None]:
# Answer_Relavancy_Scores = []
# Answer_Relavancy_Reasons = []
# Faithfulness_Scores = []
# Faithfulness_Reasons = []
# for idx, row in data.iterrows():
#     selected_cols = row[['Response1', 'Response2', 'Response3']]
#     score, reason = test_answer_relevancy(row['Question'], row['Actual Answer'], row['Response'], selected_cols.values.tolist())
#     Answer_Relavancy_Scores.append(score)
#     Answer_Relavancy_Reasons.append(reason)
    
#     score, reason = test_faithfulness(row['Question'], row['Actual Answer'], row['Response'], selected_cols.values.tolist())
#     Faithfulness_Scores.append(score)
#     Faithfulness_Reasons.append(reason)
    
# Generation_Eval_DF = pd.DataFrame({
#     'Answer Relavancy Score': Answer_Relavancy_Scores,
#     'Answer Relavancy Reason': Answer_Relavancy_Reasons,
#     'Faithfulness Score': Faithfulness_Scores,
#     'Faithfulness Reason': Faithfulness_Reasons
# })
# Generation_Eval_DF.to_csv('Generation_Metric_Evaluation.csv', index=False)
# Generation_Eval_DF

In [None]:
# Context_Precision_Scores = []
# Context_Precision_Reasons = []
# Context_Recall_Scores = []
# Context_Recall_Reasons = []
# for idx, row in data.iterrows():
#     selected_cols = row[['Response1', 'Response2', 'Response3']]
#     score, reason = test_context_precision(row['Question'], row['Actual Answer'], row['Response'], selected_cols.values.tolist())
#     Context_Precision_Scores.append(score)
#     Context_Precision_Reasons.append(reason)
    
#     score, reason = test_context_recall(row['Question'], row['Actual Answer'], row['Response'], selected_cols.values.tolist())
#     Context_Recall_Scores.append(score)
#     Context_Recall_Reasons.append(reason)
    
# Retrieval_Eval_DF = pd.DataFrame({
#     'Context Precision Score': Context_Precision_Scores,
#     'Context Precision Reason': Context_Precision_Reasons,
#     'Context Recall Score': Context_Recall_Scores,
#     'Context Recall Reason': Context_Recall_Reasons
# })
# Retrieval_Eval_DF.to_csv('Retrieval_Metric_Evaluation.csv', index=False)
# Retrieval_Eval_DF