### Set Azure OpenAI API key

In [1]:
import os
import getpass


# Ask for the key only when it is not already exported, so that
# "Restart Kernel -> Run All" also works in non-interactive runs.
# The same variable is read by every Azure client created in this notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")

### Setup RAG generator

In [2]:
from langchain_openai import AzureChatOpenAI


# Generator model: the chat model that writes the RAG answers (and, for
# "RAG_filter_hard", extracts the query metadata). Output length is limited
# through max_tokens = 256.
# NOTE(review): azure_endpoint holds the full chat-completions route
# (deployment path + api-version query) while azure_deployment and api_version
# are also passed separately; the client is usually given only the resource
# base URL -- confirm this combination resolves to the intended route.
llm = AzureChatOpenAI(
    azure_endpoint = "https://keystone1.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview",
    api_key = os.environ["OPENAI_API_KEY"],
    api_version = "2024-08-01-preview",
    azure_deployment = "gpt-4o-mini",
    max_tokens = 256
)


### Simple RAG 

In [3]:
# Answer-generation prompt. answer_with_rag() fills the {question} and
# {context} placeholders with str.format().
# The string is not raw, so each "\n" below is a real newline in addition to
# the physical line break that follows it.
SIMPLE_RAG_PROMPT_TEMPLATE = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum, keep the answer concise and DO NOT mention the context and from which documents you take information.
    Question: {question}\n 
    Context: {context}\n 
    Answer: 
"""

### Hard-filtered RAG

In [4]:
# Metadata-extraction prompt for the "RAG_filter_hard" mode; {query} is filled
# with the user question via str.format().
# context_filter() maps the lines of the model reply onto
# country / continent / year / target, so keep the trailing labels in this order.
HARD_FILTERED_RAG_METADATA_EXTRACTOR = """
    You are tasked to extract useful info from a query provided. 
    These info are: year, country, the country's continent and target (clinic or lab). 
    If the country is not specified, country and continent will be "Europe"; if the target is not specified it will be "all".
    Query: {query}\n
    Country:
    Continent:
    Year:
    Target:
"""

def context_filter(llm_metadata_output: str):
    """Parse the metadata-extraction reply of the LLM into a dictionary.

    The reply is expected to contain one ``Label: value`` line for each of
    Country, Continent, Year and Target. Parsing is label based, so blank
    lines, an introductory sentence or a different line order do not shift
    the fields. If no known label is found at all, the non-empty lines are
    read positionally in the order country, continent, year, target.

    As before, every space is removed from the reply before parsing, so a
    value such as "United States" is returned as "UnitedStates".

    Args:
        llm_metadata_output: raw text returned by the LLM.

    Returns:
        A dict with the string keys "country", "continent", "year" and
        "target". Fields missing from the reply fall back to the defaults
        stated in the extraction prompt ("Europe" for country and continent,
        "all" for target) and to an empty string for the year.
    """
    field_order = ("country", "continent", "year", "target")
    defaults = {"country": "Europe", "continent": "Europe", "year": "", "target": "all"}

    # Drop empty lines so leading/trailing newlines cannot shift the fields.
    lines = [
        line.strip()
        for line in llm_metadata_output.replace(" ", "").split("\n")
        if line.strip()
    ]

    parsed = {}
    for line in lines:
        label, separator, _ = line.partition(":")
        # Tolerate markdown decoration such as "**Country:**" or "- Country:".
        field = label.strip("*-").lower()
        if separator and field in defaults and field not in parsed:
            parsed[field] = line.split(":")[-1].strip("*")

    if not parsed:
        # No recognisable label: fall back to the original positional reading.
        parsed = {
            field: line.split(":")[-1].strip("*")
            for field, line in zip(field_order, lines)
        }

    return {field: parsed.get(field, defaults[field]) for field in field_order}

In [5]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.language_models.llms import LLM
from langchain.docstore.document import Document
from typing import Optional, List, Tuple


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: InMemoryVectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
    rag_type: str = "RAG_simple"
) -> Tuple[str, List[Document]]:
    
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    
    match rag_type:
        case "RAG_simple":
            relevant_docs = knowledge_index.similarity_search(query = question, k = num_retrieved_docs)

        case "RAG_filter_hard":
            llm_metadata_output = llm.invoke(HARD_FILTERED_RAG_METADATA_EXTRACTOR.format(query = question)).content
            metadata = context_filter(llm_metadata_output)
            year = metadata["year"]
            country = metadata["country"]
            continent = metadata["continent"]
            target = metadata["target"] if metadata["target"] != "all" else ""
            relevant_docs = knowledge_index.similarity_search(query = question, 
                                                              k = num_retrieved_docs, 
                                                              filter = lambda doc: 
                                                              f"{year}" in doc.metadata["keywords"] and
                                                              (f"{country}" in doc.metadata["keywords"] or f"{continent}" in doc.metadata["keywords"]) and 
                                                              f"{target}" in doc.metadata["keywords"] if target == "lab" or target == "clinic" else True)
            
        case _: 
            return
           
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = SIMPLE_RAG_PROMPT_TEMPLATE.format(question = question, context = context)

    # Redact an answer
    answer = llm.invoke(final_prompt).content

    return answer, relevant_docs

### RAG tests runner (simple RAG, RAG with hard filters, RAG with llm filters)

In [6]:
import json
from time import sleep
from tqdm.auto import tqdm
from langchain_core.vectorstores import VectorStore
from datasets import Dataset
from typing import Optional
from ragatouille import RAGPretrainedModel


def run_rag_tests(
    eval_dataset: Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
    k = 4, 
    k_final = 4,
    rag_type: str = "RAG_simple",
    pause_seconds: float = 5
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    The run is resumable: answers already stored in ``output_file`` are loaded
    and their questions are skipped. The file is rewritten after every new
    answer so that an interrupted run loses at most one example.

    Args:
        eval_dataset: iterable of examples exposing the keys "question",
            "answer" and "source_doc".
        llm: model forwarded to ``answer_with_rag``.
        knowledge_index: vector store forwarded to ``answer_with_rag``.
        output_file: path of the JSON file holding the list of results.
        reranker: optional reranker forwarded to ``answer_with_rag``.
        verbose: print question, generated answer and true answer.
        test_settings: free-form label stored with each result.
        k: number of documents to retrieve.
        k_final: number of documents kept for the prompt.
        rag_type: retrieval mode forwarded to ``answer_with_rag``.
        pause_seconds: pause after each generated answer (request throttling).
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # No previous run, or an empty/truncated file: start from scratch.
        # Other errors (e.g. permissions) are real problems and must surface.
        outputs = []

    # Set of answered questions: O(1) membership instead of rebuilding a list
    # for every example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, 
                                                knowledge_index, 
                                                reranker = reranker,
                                                num_retrieved_docs = k,
                                                num_docs_final = k_final, 
                                                rag_type = rag_type)

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Persist after every example so the run can be resumed.
        with open(output_file, "w") as f:
            json.dump(outputs, f, indent = 4, ensure_ascii = False)

        sleep(pause_seconds)

### Define embeddings

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import AzureOpenAIEmbeddings


# Embedding models available to the experiments, keyed by the short name that
# run_tests() uses both to pick the model and to build the vector-store path.
# The commented-out entries are alternative sentence-transformers models that
# are currently disabled.
# NOTE(review): for the Azure embeddings, chunk_size = 384 is presumably the
# number of texts sent per request rather than a text-chunk length -- confirm
# against the langchain_openai documentation.
all_embeddings = {     
    # "mpnet_base_v2": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"),
     
    # "minilm_l6": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2"),
    
    # "minilm_l12": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2"),

    "multilingual": HuggingFaceEmbeddings(model_name = "intfloat/multilingual-e5-large"),
    
    "text_embedding_3_large": AzureOpenAIEmbeddings(
        azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
        api_key = os.environ["OPENAI_API_KEY"],
        model = "TextEmbedding3LargeDeployment",
        api_version = "2023-05-15",
        chunk_size = 384
    )
}

### Generate answers with RAG

In [None]:
import os
import json


def run_tests(chunking_type, 
              semantic_chunking_type, 
              model_name, chunk_size, 
              chunk_overlap, 
              eval_dataset, 
              generator_name,
              rag_type,
              k = 4,
              k_final = 4):
    """Run the RAG generation for one (chunking, embedding model) configuration.

    The answers are written to
    ``./output/<generator_name>/Text+Images/<rag_type>/rag_<settings>/dataset_countries&years_brands.json``.
    If that file already exists the configuration is considered done and
    nothing is executed.

    Relies on the notebook-level names ``llm``, ``all_embeddings``,
    ``InMemoryVectorStore`` and ``run_rag_tests``.

    Args:
        chunking_type: "page_chunking", "semantic" or any other value, which
            is treated as fixed-size chunking.
        semantic_chunking_type: variant name, used only for "semantic".
        model_name: key of ``all_embeddings``.
        chunk_size: chunk size, used only for fixed-size chunking.
        chunk_overlap: chunk overlap, used only for fixed-size chunking.
        eval_dataset: examples forwarded to ``run_rag_tests``.
        generator_name: label of the generator model, used in the paths.
        rag_type: retrieval mode forwarded to ``run_rag_tests``.
        k: number of documents to retrieve.
        k_final: number of documents kept for the prompt.
    """
    if chunking_type == "page_chunking":
        chunking = chunking_type
    elif chunking_type == "semantic":
        chunking = f"{chunking_type}_{semantic_chunking_type}"
    else:
        chunking = f"{chunk_size}_{chunk_overlap}"

    settings_name = f"chunk:{chunking}_embeddings:{model_name}_reader-model:{generator_name}"
    output_dir = f"./output/{generator_name}/Text+Images/{rag_type}/rag_{settings_name}"
    output_file_name = f"{output_dir}/dataset_countries&years_brands.json"

    # An existing output file marks the configuration as already processed.
    if os.path.exists(output_file_name):
        return

    # makedirs (not mkdir): the intermediate <generator>/Text+Images/<rag_type>
    # folders do not exist on a first run.
    os.makedirs(output_dir, exist_ok = True)

    print("Running RAG...")
    print(f"Configuration: model: {model_name}, chunking: {chunking}")
    vector_store_path = f"../indexing/models/Text+Images/{model_name}/{chunking}/{chunking}_{model_name}"
    embeddings = all_embeddings[model_name]
    vector_store = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)
    run_rag_tests(
        eval_dataset = eval_dataset,
        llm = llm,
        knowledge_index = vector_store,
        output_file = output_file_name,
        reranker = None,
        verbose = False,
        test_settings = settings_name,
        k = k,
        k_final = k_final,
        rag_type = rag_type
    )


if not os.path.exists("./output"):
    os.mkdir("./output")

# Experiment grid: every embedding model is combined with every chunking setup.
RAG_type = "RAG_filter_hard"
chunking_types = ["page_chunking", "fixed_number"]
chunk_sizes = [384, 256]
chunk_overlaps = [0, 20, 50, 100]
semantic_chunking_types = ["percentile", "interquartile", "gradient"]
model_names = all_embeddings.keys()
GENERATOR_MODEL_NAME = "GPT_4o_mini"

with open("dataset/all_QA_countries&years_brands.json", "r") as f:
    eval_dataset = json.load(f)


def _chunking_variants(kind):
    """Yield (semantic_chunking_type, chunk_size, chunk_overlap) for one chunking type.

    "fixed_number" expands to every size/overlap pair, "semantic" to every
    semantic variant, and any other type to a single parameter-free entry.
    """
    if kind == "fixed_number":
        for size in chunk_sizes:
            for overlap in chunk_overlaps:
                yield None, size, overlap
    elif kind == "semantic":
        for variant in semantic_chunking_types:
            yield variant, None, None
    else:
        yield None, None, None


for model_name in model_names:
    for chunking_type in chunking_types:
        for semantic_chunking_type, chunk_size, chunk_overlap in _chunking_variants(chunking_type):
            run_tests(chunking_type = chunking_type,
                      semantic_chunking_type = semantic_chunking_type,
                      model_name = model_name,
                      chunk_size = chunk_size,
                      chunk_overlap = chunk_overlap,
                      eval_dataset = eval_dataset,
                      generator_name = GENERATOR_MODEL_NAME,
                      rag_type = RAG_type)

### Evaluate RAG with RAGAS

In [3]:
from datasets import Dataset
from ragas import evaluate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from ragas.metrics import context_precision, answer_relevancy, faithfulness, context_recall, answer_correctness
from ragas.run_config import RunConfig
import os
import json


# Judge model and embeddings used by RAGAS.
# NOTE: this rebinds the notebook-level `llm` (the generator model) and, in the
# loop below, `eval_dataset`; re-run the setup cells before generating answers
# again in the same kernel.
llm = AzureChatOpenAI(
    azure_endpoint = "https://keystone1.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
    api_key = os.environ["OPENAI_API_KEY"],
    api_version = "2024-08-01-preview",
    azure_deployment = "gpt-4o"
)

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
    api_key=os.environ["OPENAI_API_KEY"],
    model="TextEmbedding3LargeDeployment",
    api_version="2023-05-15"
)

GENERATOR_MODEL_NAME = "GPT_4o_mini"
EVAL_SUBSET_SIZE = 15  # number of generated answers scored per configuration

# One entry per distinct configuration. Page chunking has no size/overlap, so
# it is listed once instead of being re-visited for every size/overlap pair.
# To evaluate other chunking strategies, add their entries here.
chunking_configs = [("page_chunking", None, None)] + [
    ("fixed_number", size, overlap)
    for size in [256, 384]
    for overlap in [0, 20, 50, 100]
]

for model_name in ["multilingual", "text_embedding_3_large"]:
    for chunking_type, chunk_size, chunk_overlap in chunking_configs:

        if chunking_type == "page_chunking":
            chunking = chunking_type
        else:
            chunking = f"{chunk_size}_{chunk_overlap}"

        settings_name = f"rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
        folder = f"output/{GENERATOR_MODEL_NAME}/Text+Images/RAG_simple/{settings_name}"
        results_path = f"{folder}/results_countries&years_brands.json"

        # Already evaluated: skip before loading anything.
        if os.path.exists(results_path):
            continue

        with open(f"{folder}/dataset_countries&years_brands.json", "r") as f:
            dataset = json.load(f)

        # Take a subset
        dataset = dataset[:EVAL_SUBSET_SIZE]

        ragas_columns = {
            "question": [entry["question"] for entry in dataset],
            "contexts": [entry["retrieved_docs"] for entry in dataset],
            "answer": [entry["generated_answer"] for entry in dataset],
            "ground_truth": [entry["true_answer"] for entry in dataset],
        }

        eval_dataset = Dataset.from_dict(ragas_columns)

        run_config = RunConfig(timeout = 6000, max_retries = 20, max_wait = 50, log_tenacity = False)
        print(settings_name)
        results = evaluate(dataset = eval_dataset, 
                           metrics = [context_precision, faithfulness, answer_relevancy, context_recall, answer_correctness], 
                           llm = llm, 
                           embeddings = embeddings,
                           run_config = run_config)

        results_df = results.to_pandas()
        # Jobs that failed (e.g. rate limiting) leave missing scores; the file
        # is still written, so make the gap visible instead of silent.
        if results_df.select_dtypes("number").isna().any().any():
            print(f"WARNING: missing metric values for {settings_name}; "
                  f"delete {results_path} to re-run this evaluation")

        results_df.to_json(results_path, indent = 4)


rag_chunk:page_chunking_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-08-01-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 86400 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}})
Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-08-01-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 86400 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}})
Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure O

rag_chunk:256_0_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:256_20_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:256_50_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:256_100_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_0_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_20_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_50_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_100_embeddings:multilingual_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:page_chunking_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:256_0_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:256_20_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:256_50_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:256_100_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_0_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_20_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_50_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

rag_chunk:384_100_embeddings:text_embedding_3_large_reader-model:GPT_4o_mini


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

### RAGAS results processing

In [None]:
from datasets import Dataset
from statistics import mean, variance


# `folder` is the last configuration folder set by the evaluation cell above.
# NOTE(review): the evaluation cell writes "results_countries&years_brands.json"
# while this cell reads "results.json" -- confirm which file is intended.
path = f"{folder}/results.json"
dataset = Dataset.from_json(path)
metrics = {}

for metric in ["context_precision", "faithfulness", "answer_relevancy", "context_recall", "answer_correctness"]:
    # Each column holds one mapping {sample id: score}.
    for d in dataset[metric]:
        # Missing scores (failed evaluations) are counted as 0.0.
        scores = [0.0 if d[entry] is None else d[entry] for entry in d]
        metrics[metric] = {
            "mean": mean(scores),
            # statistics.variance needs at least two samples.
            "variance": variance(scores) if len(scores) > 1 else float("nan"),
        }

    # Single quotes inside the f-string: reusing double quotes is a syntax
    # error before Python 3.12.
    print(f"{metric}:\n{metrics[metric]['mean']:.3f}, {metrics[metric]['variance']:.3f}\n")

print("\n")
for d in dataset["response"]:
    for k in d:
        print(f"{k}: {d[k]}\n")

### Inspect post-evaluation

In [None]:
from datasets import Dataset


# `folder` is the last configuration folder set by the evaluation cell.
path = f"{folder}/results.json"
dataset = Dataset.from_json(path)

print(path)

key = "2"  # id of the sample to inspect
# print(dataset["user_input"][0][key])
# print(dataset["reference"][0][key])
print(dataset["response"][0][key])

for metric in ["context_precision", "faithfulness", "answer_relevancy", "context_recall", "answer_correctness"]:
    score = dataset[metric][0][key]
    # A failed evaluation is stored as null; ":.3f" would raise on None.
    print("n/a" if score is None else f"{score:.3f}")