### Set OpenAI API key

In [2]:
import os
import getpass


# Prompt for the API key without echoing it, then publish it through the
# process environment so the cells below can read os.environ["OPENAI_API_KEY"].
os.environ.update({"OPENAI_API_KEY": getpass.getpass()})

### Evaluate RAG with RAGAS

In [None]:
from datasets import Dataset
from ragas import evaluate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from ragas.metrics import context_precision, answer_relevancy, faithfulness, context_recall, answer_correctness
from ragas.run_config import RunConfig
import os
import json


# Chat model handed to RAGAS as the judge LLM.
# NOTE(review): azure_endpoint below is a full deployment URL (path + query);
# the langchain_openai docs describe this parameter as the resource base URL.
# Confirm the client resolves the final request URL as intended.
llm = AzureChatOpenAI(
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
    api_key=os.environ["OPENAI_API_KEY"],
    api_version="2024-08-01-preview",
    azure_deployment="gpt-4o",
)

# Embedding model handed to RAGAS for the embedding-based metrics.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
    api_key=os.environ["OPENAI_API_KEY"],
    model="TextEmbedding3LargeDeployment",
    api_version="2023-05-15",
)

GENERATOR_MODEL_NAME = "GPT_4o_mini"

# Loop-invariant evaluation settings: built once rather than per configuration.
ragas_metrics = [context_precision, faithfulness, answer_relevancy, context_recall, answer_correctness]
run_config = RunConfig(timeout=6000, max_retries=20, max_wait=50, log_tenacity=False)

# Sweep every (embedding model, chunking strategy, chunk size, overlap)
# combination and evaluate each one whose results file does not exist yet.
for model_name in ["multilingual", "text_embedding_3_large"]:
    for chunking_type in ["page_chunking", "fixed_number"]:
        for chunk_size in [256, 384]:
            for chunk_overlap in [0, 20, 50, 100]:

                if chunking_type == "page_chunking":
                    # Page chunking ignores size/overlap, so all eight inner
                    # iterations resolve to the same folder.
                    chunking = chunking_type
                elif chunking_type == "semantic":
                    # NOTE(review): "semantic" is not in the list iterated above and
                    # `semantic_chunking_type` is not defined in the visible cells;
                    # define it before enabling this strategy.
                    chunking = f"{chunking_type}_{semantic_chunking_type}"
                else:
                    chunking = f"{chunk_size}_{chunk_overlap}"

                settings_name = f"rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
                folder = f"output/{GENERATOR_MODEL_NAME}/Text+Images/RAG_filter_hard/{settings_name}"
                results_path = f"{folder}/results_countries&years_brands.json"

                # Check for existing results *before* touching the dataset, so
                # already-evaluated (and repeated page_chunking) configurations
                # cost nothing.
                if os.path.exists(results_path):
                    continue

                with open(f"{folder}/dataset_countries&years_brands.json", "r", encoding="utf-8") as f:
                    dataset = json.load(f)

                # Take a subset
                dataset = dataset[:15]

                # Column-oriented view of the entries, using the column names
                # this script passes to RAGAS.
                d = {
                    "question": [entry["question"] for entry in dataset],
                    "contexts": [entry["retrieved_docs"] for entry in dataset],
                    "answer": [entry["generated_answer"] for entry in dataset],
                    "ground_truth": [entry["true_answer"] for entry in dataset],
                }

                eval_dataset = Dataset.from_dict(d)

                print(settings_name)
                results = evaluate(
                    dataset=eval_dataset,
                    metrics=ragas_metrics,
                    llm=llm,
                    embeddings=embeddings,
                    run_config=run_config,
                )

                results.to_pandas().to_json(results_path, indent=4)


### RAGAS results processing

In [None]:
from datasets import Dataset
from statistics import mean, variance


# Configuration whose RAGAS results are summarized by this cell.
RAG_type = "RAG_filter_retriever"
chunking_type = "fixed_number"
chunk_size = 256
chunk_overlap = 100
model_name = "text_embedding_3_large"
GENERATOR_MODEL_NAME = "GPT_4o_mini"

if chunking_type == "page_chunking":
    chunking = chunking_type
else:
    chunking = f"{chunk_size}_{chunk_overlap}"

settings_name = f"output/{GENERATOR_MODEL_NAME}/Text+Images/{RAG_type}/rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
path1 = f"{settings_name}/results_easy.json"
path2 = f"{settings_name}/results_years_countries.json"
path3 = f"{settings_name}/results_countries&years_brands.json"

dataset1 = Dataset.from_json(path1)
dataset2 = Dataset.from_json(path2)
dataset3 = Dataset.from_json(path3)

metrics1 = {}
metrics2 = {}
metrics3 = {}


def _column_stats(column):
    """Return {"mean", "variance"} of the scores in a metric column.

    Each row of *column* is iterated as a mapping whose values are the
    individual scores (presumably sample id -> score, as produced by a
    column-oriented JSON dump; verify against the results files). Missing
    scores (None) count as 0.0. If the column has several rows the last one
    wins; if it has none, None is returned.

    Raises:
        statistics.StatisticsError: if a row holds fewer than two scores
            (required by ``variance``).
    """
    stats = None
    for row in column:
        # Build the score list once and reuse it for both statistics.
        scores = [0.0 if row[entry] is None else row[entry] for entry in row]
        stats = {"mean": mean(scores), "variance": variance(scores)}
    return stats


print(settings_name + "\n")
for metric in ["context_precision", "faithfulness", "answer_relevancy", "context_recall", "answer_correctness"]:

    for split_metrics, split_dataset in (
        (metrics1, dataset1),
        (metrics2, dataset2),
        (metrics3, dataset3),
    ):
        split_stats = _column_stats(split_dataset[metric])
        if split_stats is not None:
            split_metrics[metric] = split_stats

    (m1, v1) = metrics1[metric]["mean"], metrics1[metric]["variance"]
    # (m2, v2) = metrics2[metric]["mean"], metrics2[metric]["variance"]
    # (m3, v3) = metrics3[metric]["mean"], metrics3[metric]["variance"]
    # print(f"{metric}:\n{mean([m1, m2, m3]):.3f}, {mean([v1, v2, v3]):.3f}")
    print(f"{m1:.3f}, ")

# all_answers = list(dataset1["response"][0].values()) + list(dataset2["response"][0].values()) + list(dataset3["response"][0].values())
# idks = len(list(filter(lambda ans: "I don't know" in ans, all_answers)))
# print("\n" + "I don't know:\n" + str(idks))


### Inspect post-evaluation

In [None]:
from datasets import Dataset


# Print the generated response and every RAGAS score for one sample of a
# results file.
# NOTE: `folder` is not defined in this cell; it must already exist in the
# notebook namespace (e.g. left over from the evaluation cell).
path = f"{folder}/results.json"
dataset = Dataset.from_json(path)

print(path)

# Key of the sample to inspect inside each column's mapping.
key = "2"
# print(dataset["user_input"][0][key])
# print(dataset["reference"][0][key])
print(dataset["response"][0][key])

for metric_name in (
    "context_precision",
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "answer_correctness",
):
    score = dataset[metric_name][0][key]
    # A metric that could not be computed is stored as None; ":.3f" would
    # raise TypeError on it, so print a placeholder instead.
    print("n/a" if score is None else f"{score:.3f}")