### Set OpenAI API key

In [1]:
import os
import getpass


os.environ["OPENAI_API_KEY"] = getpass.getpass()

### Setup RAGAS

In [2]:
from datasets import Dataset
from ragas import evaluate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from ragas.metrics import context_precision, answer_relevancy, faithfulness, context_recall, answer_correctness
from ragas.run_config import RunConfig
import os
import json


llm = AzureChatOpenAI(
    azure_endpoint = "https://keystone1.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
    api_key = os.environ["OPENAI_API_KEY"],
    api_version = "2024-08-01-preview",
    azure_deployment = "gpt-4o"
)

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
    api_key=os.environ["OPENAI_API_KEY"],
    model="TextEmbedding3LargeDeployment",
    api_version="2023-05-15"
)

### Evaluate RAG with RAGAS

In [None]:
GENERATOR_MODEL_NAME = "GPT_4o_mini"
dataset_type = "easy"
rag_type = "RAG_filter_retriever"

input_file = f"all_QA_{dataset_type}.json"
output_file = f"results_{dataset_type}.json"

for model_name in ["text_embedding_3_large"]:
    for chunking_type in ["fixed_number"]:
        for chunk_size in [256, 384, 512]:
            for chunk_overlap in [100]:

                if chunking_type == "page_chunking":
                    chunking = chunking_type
                elif chunking_type == "fixed_number":
                    chunking = f"{chunk_size}_{chunk_overlap}"

                settings_name = f"rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
                folder = f"output/{GENERATOR_MODEL_NAME}/Text+Images/{rag_type}/{model_name}/{settings_name}"
                input_folder = "../generation/" + folder

                with open(f"{input_folder}/{input_file}", "r") as f:
                    dataset = json.load(f)

                # Take a subset
                # dataset = dataset[:15]

                d = {
                    "question": [entry["question"] for entry in dataset],
                    "contexts": [entry["retrieved_docs"] for entry in dataset],
                    "answer": [entry["generated_answer"] for entry in dataset],
                    "ground_truth": [entry["true_answer"] for entry in dataset],
                }

                eval_dataset = Dataset.from_dict(d)
                
                if not os.path.exists(f"{folder}"):
                    os.mkdir(f"{folder}")
                
                if not os.path.exists(f"{folder}/{output_file}"):
                    run_config = RunConfig(timeout = 50_000, max_retries = 20, max_wait = 50, log_tenacity = False)
                    print(settings_name)
                    results = evaluate(dataset = eval_dataset, 
                                    metrics = [context_precision, faithfulness, answer_relevancy, context_recall, answer_correctness], 
                                    llm = llm, 
                                    embeddings = embeddings,
                                    run_config = run_config)
                    
                    results.to_pandas().to_json(f"{folder}/{output_file}", indent = 4)


### RAGAS results processing

In [None]:
from datasets import Dataset
from statistics import mean, stdev


RAG_type = "RAG_filter_retriever"
chunking_type = "fixed_number"
chunk_size = 512
chunk_overlap = 100
model_name = "text_embedding_3_large"
GENERATOR_MODEL_NAME = "GPT_4o_mini"

if chunking_type == "page_chunking":
  chunking = chunking_type
else:
    chunking = f"{chunk_size}_{chunk_overlap}"

settings_name = f"output/{GENERATOR_MODEL_NAME}/Text+Images/{RAG_type}/{model_name}/rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
path1 = f"{settings_name}/results_easy.json"
path2 = f"{settings_name}/results_years_countries.json"
path3 = f"{settings_name}/results_countries&years_brands.json"

dataset1 = Dataset.from_json(path1)
dataset2 = Dataset.from_json(path2)
dataset3 = Dataset.from_json(path3)

metrics1 = {}
metrics2 = {}
metrics3 = {}

print(settings_name + "\n")
for metric in ["context_precision", "faithfulness", "answer_relevancy", "context_recall", "answer_correctness"]:
      
    # for d in dataset1[metric]:
    #     metrics1[metric] = {"mean": mean([ d[entry] if d[entry] != None else 0.0 for entry in d ]),
    #     	"stdev": stdev([ d[entry] if d[entry] != None else 0.0 for entry in d ]) }

    metrics1[metric] = {
        "mean": mean([ val if val != None else 0.0 for val in dataset1[metric] ]),
        "stdev": stdev([ val if val != None else 0.0 for val in dataset1[metric] ]) 
    }
    
    for d in dataset2[metric]:
        metrics2[metric] = {"mean": mean([ d[entry] if d[entry] != None else 0.0 for entry in d ]),
              "stdev": stdev([ d[entry] if d[entry] != None else 0.0 for entry in d ]) }
        
    for d in dataset3[metric]:
        metrics3[metric] = {"mean": mean([ d[entry] if d[entry] != None else 0.0 for entry in d ]),
              "stdev": stdev([ d[entry] if d[entry] != None else 0.0 for entry in d ]) }
	
    (m1, v1) = metrics1[metric]["mean"], metrics1[metric]["stdev"]
    (m2, v2) = metrics2[metric]["mean"], metrics2[metric]["stdev"]
    (m3, v3) = metrics3[metric]["mean"], metrics3[metric]["stdev"]

    print(f"{metric}:")
    # print(f"{m1:.3f}, {v1:.3f}\n{m2:.3f}, {v2:.3f}\n{m3:.3f}, {v3:.3f}")
    print(f"{mean([m1, m2, m3]):.3f}, {mean([v1, v2, v3]):.3f}")

filter_f = lambda ans: "I don't know" in ans
idks1 = len(list(filter(lambda ans: "I don't know" in ans, dataset1["response"])))
idks2 = len(list(filter(lambda ans: "I don't know" in ans, dataset2["response"][0].values())))
idks3 = len(list(filter(lambda ans: "I don't know" in ans, dataset3["response"][0].values())))

print("\n" + "I don't know:")
# print(f"{idks1}\n{idks2}\n{idks3}")
print(f"{idks1 + idks2 + idks3}")

### Inspect post-evaluation

In [None]:
from datasets import Dataset


path = f"{folder}/results.json"
dataset = Dataset.from_json(path)

print(path)
key = "2"
# print(f"{dataset["user_input"][0][key]}")
# print(f"{dataset["reference"][0][key]}")
print(f"{dataset["response"][0][key]}")
print(f"{dataset["context_precision"][0][key]:.3f}")
print(f"{dataset["faithfulness"][0][key]:.3f}")
print(f"{dataset["answer_relevancy"][0][key]:.3f}")
print(f"{dataset["context_recall"][0][key]:.3f}")
print(f"{dataset["answer_correctness"][0][key]:.3f}")

### Compute intersection and union between retrieved context and reference context

In [83]:
from difflib import SequenceMatcher
import json
from statistics import mean, stdev


dataset_type = "easy"
rag_type = "RAG_filter_retriever"
model_name = "text_embedding_3_large"
chunking_type = "fixed_number"
chunk_size = 256
chunk_overlap = 100

if chunking_type == "page_chunking":
  chunking = chunking_type
elif chunking_type == "fixed_number":
    chunking = f"{chunk_size}_{chunk_overlap}"

reference_file = f"dataset/all_QA_{dataset_type}.json"
base_path = f"../generation/output/GPT_4o_mini/Text+Images/{rag_type}/{model_name}"
settings = f"rag_chunk:{chunking}_embeddings:{model_name}_reader-model:GPT_4o_mini"
generation_file = f"{base_path}/{settings}/dataset_{dataset_type}.json"

f1 = open(reference_file, "r")
f2 = open(generation_file, "r")
rf = json.load(f1)
gf = json.load(f2)

intersections = []
unions = []

for item in list(zip(rf, gf)):
    ref_context = item[0]["context"]
    gen_context = "".join(item[1]["retrieved_docs"])

    a = ref_context.split()
    b = gen_context.split()
    seq_matcher = SequenceMatcher(None, a, b)
    lcs = seq_matcher.find_longest_match(0, len(a), 0, len(b))

    intersection = lcs.size
    union = (len(a) + len(b) - intersection) / 256
    intersections.append(intersection)
    unions.append(union)
    
f1.close()
f2.close()

(m_i, v_i) = mean(intersections), stdev(intersections)
(m_u, v_u) = mean(unions), stdev(unions)

print(f"Intersection:\n{m_i:.1f}, {v_i:.1f}")
print(f"Union:\n{m_u:.2f}, {v_u:.2f}")


Intersection:
79.1, 80.8
Union:
3.67, 0.54
