### Set Azure OpenAI API key

In [74]:
import os
import getpass


# Read the key interactively (getpass does not echo the input) and expose it
# through the environment variable the later cells read.
# An explicit prompt replaces getpass's generic "Password: " so it is clear
# which credential is being asked for.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")

### Setup RAGAS

In [104]:
from datasets import Dataset
from ragas import evaluate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from ragas.metrics import context_precision, answer_relevancy, faithfulness, context_recall, answer_correctness
from ragas.run_config import RunConfig
from time import sleep
from tqdm.auto import tqdm
import os
import json


# Chat model handed to RAGAS `evaluate` as the judge LLM.
# NOTE(review): `azure_endpoint` carries the full deployment path and query
# string; the Azure clients usually expect only the resource base URL.
# Confirm this resolves to the intended deployment.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    api_version="2024-08-01-preview",
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
    api_key=os.environ["OPENAI_API_KEY"],
)

# Embedding model handed to RAGAS `evaluate`.
embeddings = AzureOpenAIEmbeddings(
    model="TextEmbedding3LargeDeployment",
    api_version="2023-05-15",
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
    api_key=os.environ["OPENAI_API_KEY"],
)

# Metrics computed for every evaluated sample.
ragas_metrics = [
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_correctness,
]

### Evaluate RAG with RAGAS

In [None]:
GENERATOR_MODEL_NAME = "GPT_4o_mini"
dataset_type = "countries&years_brands"
rag_type = "RAG_filter_retriever"

input_file = f"all_QA_{dataset_type}.json"
output_file = f"results_{dataset_type}.json"

# Retry/timeout policy for the RAGAS calls. It is the same for every
# configuration, so it is built once instead of once per loop iteration.
run_config = RunConfig(timeout=1200, max_retries=20, max_wait=50, log_tenacity=False)

# Output key -> column name in the DataFrame returned by `results.to_pandas()`.
_RESULT_COLUMNS = {
    "question": "user_input",
    "retrieved_docs": "retrieved_contexts",
    "generated_answer": "response",
    "true_answer": "reference",
    "cp": "context_precision",
    "f": "faithfulness",
    "ar": "answer_relevancy",
    "cr": "context_recall",
    "ac": "answer_correctness",
}


def _score_entry(entry):
    """Evaluate a single generated QA record with RAGAS.

    Args:
        entry: dict with the keys "question", "retrieved_docs",
            "generated_answer" and "true_answer".

    Returns:
        dict holding the evaluated sample fields plus the five metric scores
        under the short keys "cp", "f", "ar", "cr" and "ac".
    """
    eval_dataset = Dataset.from_dict({
        "question": [entry["question"]],
        "contexts": [entry["retrieved_docs"]],
        "answer": [entry["generated_answer"]],
        "ground_truth": [entry["true_answer"]],
    })
    results = evaluate(
        dataset=eval_dataset,
        metrics=ragas_metrics,
        llm=llm,
        embeddings=embeddings,
        run_config=run_config,
        show_progress=False,
    ).to_pandas()

    # The dataset holds exactly one sample, so row 0 is the only row.
    return {key: results.loc[0, column] for key, column in _RESULT_COLUMNS.items()}


for model_name in ["text_embedding_3_large"]:
    for chunking_type in ["fixed_number"]:
        for chunk_size in [256, 384, 512]:
            for chunk_overlap in [100]:

                if chunking_type == "page_chunking":
                    chunking = chunking_type
                elif chunking_type == "fixed_number":
                    chunking = f"{chunk_size}_{chunk_overlap}"
                else:
                    # Fail loudly instead of reusing a stale/undefined `chunking`.
                    raise ValueError(f"Unknown chunking_type: {chunking_type!r}")

                settings_name = f"rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
                folder = f"output/{GENERATOR_MODEL_NAME}/Text+Images/{rag_type}/{model_name}/{settings_name}"
                input_folder = "../generation/" + folder
                output_path = f"{folder}/{output_file}"

                # `folder` is several levels deep; os.mkdir would fail when any
                # parent directory is missing.
                os.makedirs(folder, exist_ok=True)

                # An existing results file marks this configuration as done.
                if os.path.exists(output_path):
                    continue

                with open(f"{input_folder}/{input_file}", "r", encoding="utf-8") as f:
                    dataset = json.load(f)

                print(settings_name)
                all_results = [_score_entry(entry) for entry in tqdm(dataset)]

                # Write only after every entry has been scored. Opening the file
                # up-front left an empty file behind whenever evaluation failed,
                # and the exists-check above then skipped the configuration on
                # the next run.
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(all_results, f, indent=4, ensure_ascii=False)
                    


### RAGAS results processing

In [None]:
from statistics import mean, stdev
from math import isnan


RAG_type = "RAG_filter_retriever"
chunking_type = "fixed_number"
chunk_size = 512
chunk_overlap = 100
model_name = "text_embedding_3_large"
GENERATOR_MODEL_NAME = "GPT_4o_mini"

if chunking_type == "page_chunking":
    chunking = chunking_type
else:
    chunking = f"{chunk_size}_{chunk_overlap}"

settings_name = f"output/{GENERATOR_MODEL_NAME}/Text+Images/{RAG_type}/{model_name}/rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
path1 = f"{settings_name}/results_easy.json"
path2 = f"{settings_name}/results_years_countries.json"
path3 = f"{settings_name}/results_countries&years_brands.json"


def _load_json(path):
    """Parse one JSON file, closing the handle even if reading or parsing fails."""
    with open(path, "r") as f:
        return json.load(f)


def _metric_summary(dataset, metric):
    """Return the mean and sample standard deviation of one metric.

    NaN scores are counted as 0.0 rather than dropped.

    Args:
        dataset: list of result dicts, each holding a numeric `metric` value.
        metric: key of the score to aggregate (e.g. "cp").

    Returns:
        dict with the keys "mean" and "stdev".

    Raises:
        statistics.StatisticsError: if `dataset` has fewer than two entries
            (required by `stdev`).
    """
    values = [entry[metric] if not isnan(entry[metric]) else 0.0 for entry in dataset]
    return {"mean": mean(values), "stdev": stdev(values)}


def filter_f(ans):
    """Tell whether a generated answer contains the literal "I don't know"."""
    return "I don't know" in ans


dataset1, dataset2, dataset3 = _load_json(path1), _load_json(path2), _load_json(path3)

metrics1 = {}
metrics2 = {}
metrics3 = {}

print(settings_name + "\n")
for metric in ["cp", "f", "ar", "cr", "ac"]:

    for summary, dataset in zip((metrics1, metrics2, metrics3), (dataset1, dataset2, dataset3)):
        summary[metric] = _metric_summary(dataset, metric)

    # Unweighted average across the three datasets, for both the per-dataset
    # means and the per-dataset standard deviations.
    means = [summary[metric]["mean"] for summary in (metrics1, metrics2, metrics3)]
    stdevs = [summary[metric]["stdev"] for summary in (metrics1, metrics2, metrics3)]

    print(f"{metric}:")
    print(f"{mean(means):.3f}, {mean(stdevs):.3f}")

print("\nI don't know:")
for i, dataset in enumerate([dataset1, dataset2, dataset3]):
    idks = sum(1 for qa in dataset if filter_f(qa["generated_answer"]))
    print(f"Dataset {i + 1}: {idks}")

In [None]:
# Spot-check: display the generated answer of entry 11 in `dataset3`.
dataset3[11]["generated_answer"]

### Compute intersection and union between retrieved context and reference context

In [139]:
import json
from difflib import SequenceMatcher
from statistics import mean, stdev


dataset_type = "countries&years_brands"
rag_type = "RAG_filter_retriever"
model_name = "text_embedding_3_large"
chunking_type = "fixed_number"
chunk_size = 512
chunk_overlap = 100

if chunking_type == "page_chunking":
    chunking = chunking_type
elif chunking_type == "fixed_number":
    chunking = f"{chunk_size}_{chunk_overlap}"

reference_file = f"dataset/all_QA_{dataset_type}.json"
base_path = f"../generation/output/GPT_4o_mini/Text+Images/{rag_type}/{model_name}"
settings = f"rag_chunk:{chunking}_embeddings:{model_name}_reader-model:GPT_4o_mini"
generation_file = f"{base_path}/{settings}/all_QA_{dataset_type}.json"
# Computed outside the f-string: reusing double quotes inside an f-string
# expression is a SyntaxError before Python 3.12.
evaluation_base_path = base_path.replace("generation", "evaluation")
res_file = f"{evaluation_base_path}/{settings}/results_{dataset_type}.json"

# `with` closes the handles even when parsing fails.
with open(reference_file, "r") as f1:
    rf = json.load(f1)
with open(generation_file, "r") as f2:
    gf = json.load(f2)

intersections = []
unions = []

# Reference and generated records are paired by position.
for ref_entry, gen_entry in zip(rf, gf):
    ref_context = ref_entry["context"]
    # Join with a space so the last word of one retrieved document is not
    # fused with the first word of the next before the whitespace split.
    gen_context = " ".join(gen_entry["retrieved_docs"])

    a = ref_context.split()
    b = gen_context.split()
    # NOTE(review): SequenceMatcher's default `autojunk` heuristic can ignore
    # very frequent words on sequences of 200+ items, so the reported block may
    # be shorter than the true longest common run. Confirm whether
    # `autojunk=False` is wanted here.
    seq_matcher = SequenceMatcher(None, a, b)
    lcs = seq_matcher.find_longest_match(0, len(a), 0, len(b))

    # "Intersection" = length, in words, of the longest contiguous run shared
    # by the reference context and the retrieved context.
    intersection = lcs.size
    # NOTE(review): the union is divided by a hard-coded 256 (presumably a
    # chunk size, to be confirmed), so the stored "jaccard" below is 256x the
    # plain intersection / union-in-words ratio.
    union = (len(a) + len(b) - intersection) / 256
    intersections.append(intersection)
    unions.append(union)

# Built once after the loop: it used to be rebuilt on every iteration and was
# left undefined when there were no records. A zero union (both contexts
# empty) yields 0.0 instead of a ZeroDivisionError.
jaccards = [i / u if u else 0.0 for i, u in zip(intersections, unions)]

with open(res_file, "r") as f3:
    resf = json.load(f3)

# Attach the overlap figures to each evaluation record, matched by position.
for i, entry in enumerate(resf):
    entry["intersection"] = intersections[i]
    entry["union"] = unions[i]
    entry["jaccard"] = jaccards[i]

with open(res_file, "w") as f3:
    json.dump(resf, f3, indent = 4, ensure_ascii = False)

(m_i, v_i) = mean(intersections), stdev(intersections)
(m_u, v_u) = mean(unions), stdev(unions)

print(f"Intersection:\n{m_i:.1f}, {v_i:.1f}")
print(f"Union:\n{m_u:.2f}, {v_u:.2f}")


Intersection:
140.3, 41.3
Union:
10.13, 0.75


### Anomaly detector between RAG types

In [162]:
from datasets import Dataset


rag_type_1 = "RAG_simple"
rag_type_2 = "RAG_filter_retriever"
dataset_name = "results_countries&years_brands.json"
model_name = "text_embedding_3_large"
chunk_size = 512
chunk_overlap = 100

settings_name_1 = f"output/GPT_4o_mini/Text+Images/{rag_type_1}/{model_name}/rag_chunk:{chunk_size}_{chunk_overlap}_embeddings:{model_name}_reader-model:GPT_4o_mini"
settings_name_2 = f"output/GPT_4o_mini/Text+Images/{rag_type_2}/{model_name}/rag_chunk:{chunk_size}_{chunk_overlap}_embeddings:{model_name}_reader-model:GPT_4o_mini"

path1 = f"{settings_name_1}/{dataset_name}"
path2 = f"{settings_name_2}/{dataset_name}"
dataset1 = Dataset.from_json(path1)
dataset2 = Dataset.from_json(path2)


def _intersection_values(dataset):
    """Return the per-question intersection sizes stored in a results dataset.

    Two layouts of the "intersection" column are accepted, so the cell no
    longer depends on which layout each results file happens to use:

    * a single row whose value is a mapping of question index -> size;
    * one scalar size per row.

    Args:
        dataset: object exposing an "intersection" column that supports
            `len()` and integer indexing.

    Returns:
        list of intersection sizes, one per question.
    """
    column = dataset["intersection"]
    if len(column) > 0 and isinstance(column[0], dict):
        return list(column[0].values())
    return list(column)


# 2 should be advanced RAG and 1 simple RAG
ints_simple = _intersection_values(dataset1)
ints_advanced = _intersection_values(dataset2)

# Indices of questions where the advanced RAG overlaps the reference context
# less than the simple RAG does.
anomalies = [i for i, val in enumerate(ints_advanced) if val < ints_simple[i]]
anomalies


[15]

In [163]:
# For every flagged index, show the advanced-RAG intersection next to the
# simple-RAG one.
for idx in anomalies:
    advanced, simple = ints_advanced[idx], ints_simple[idx]
    print(f"{idx}: {advanced} vs {simple}")


15: 97 vs 117
