### Set OpenAI API key

In [1]:
import os
import getpass


# Prompt for the key interactively (input is not echoed) so it never appears in
# the notebook source, then publish it through the process environment, where
# the client constructors in later cells read it.
os.environ["OPENAI_API_KEY"] = getpass.getpass()

### Setup RAG generator

In [2]:
from langchain_openai import AzureChatOpenAI


# Chat model shared by the notebook: it is passed to `answer_with_rag`, which
# uses it for answer generation and (for "RAG_filter_retriever") for metadata
# extraction. Replies are capped at 256 tokens.
# NOTE(review): `azure_endpoint` holds the full chat-completions request URL
# (deployment path and `api-version` query included). The langchain-openai docs
# describe this parameter as the resource base URL, e.g.
# "https://<resource>.openai.azure.com/" - confirm the current form is intended.
llm = AzureChatOpenAI(
    azure_endpoint = "https://keystone1.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview",
    api_key = os.environ["OPENAI_API_KEY"],
    api_version = "2024-08-01-preview",
    azure_deployment = "gpt-4o-mini",
    max_tokens = 256
)


### Generation prompt

In [3]:
# Prompt for the answer-generation step; filled in with `str.format` using the
# `question` and `context` placeholders.
# The string is not raw, so each "\n" escape is a real line break in addition to
# the physical line break that follows it in the template.
SIMPLE_RAG_PROMPT_TEMPLATE = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum, keep the answer concise and DO NOT mention the context and from which documents you take information.
    Question: {question}\n 
    Context: {context}\n 
    Answer: 
"""

### Retriever-filtered RAG

In [14]:
from langchain.docstore.document import Document


# Prompt asking the LLM to extract country, continent, year and target
# ("clinic" or "lab") from a user query; filled in with `str.format` using the
# `query` placeholder. The model's reply is turned into a metadata dict by
# `context_filter`, with the keys country, continent, year and target.
METADATA_EXTRACTOR = """
    You are tasked to extract useful info from a query provided. 
    These info are: years, countries, the countries' continent and target (clinic or lab). 
    If the country is not specified, country and continent will be "Europe"; if the target is not specified it will be "all". 
    You must specify the continent of each country in the query.
    
    Query: {query}\n
    Country:
    Continent:
    Year:
    Target:
"""


def context_filter(llm_metadata_output: str):
    """Parse the metadata-extractor reply into a dict of filter values.

    The reply is expected to contain one ``Label: value`` line for each of
    Country, Continent, Year and Target. All spaces are removed before parsing,
    so ``"South America"`` becomes ``"SouthAmerica"`` and ``"2021, 2022"``
    becomes ``"2021,2022"``.

    Lines are matched by label (case-insensitive, surrounding markdown ``*``,
    ``-`` and ``#`` ignored), so blank lines, a leading newline or reordered
    fields no longer shift the values into the wrong keys. Lines without a
    recognised label fill the still-missing fields in the order country,
    continent, year, target, which keeps purely positional replies working.

    Args:
        llm_metadata_output: Raw text returned by the LLM.

    Returns:
        Dict with the string keys ``"country"``, ``"continent"``, ``"year"``
        and ``"target"``. Fields absent from the reply fall back to the
        defaults stated in the extraction prompt ("Europe" for country and
        continent, "all" for target); a missing year becomes ``""``.
    """
    # Fallbacks mirror the defaults the extraction prompt asks the model to apply.
    defaults = {"country": "Europe", "continent": "Europe", "year": "", "target": "all"}

    metadata = {}
    unlabeled_values = []
    for raw_line in llm_metadata_output.replace(" ", "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        parts = line.split(":")
        label = parts[0].strip("*-#").lower()
        # As before, the value is whatever follows the last colon on the line.
        value = parts[-1].strip("*")

        if len(parts) > 1 and label in defaults:
            # First occurrence wins if the model repeats a label.
            metadata.setdefault(label, value)
        elif value:
            unlabeled_values.append(value)

    missing = [field for field in defaults if field not in metadata]
    for field, value in zip(missing, unlabeled_values):
        metadata[field] = value
    for field in missing:
        metadata.setdefault(field, defaults[field])

    return metadata


def filter_function(doc: Document, metadata):
    """Decide whether a document passes the metadata filter.

    A document is kept when it matches the year filter AND the location filter
    AND the target filter. Each test uses ``term in keywords``, i.e. a substring
    test when ``doc.metadata["keywords"]`` is a string and a membership test
    when it is a list.

    Args:
        doc: Object exposing a ``metadata`` dict; its ``"keywords"`` entry is
            what the filters are checked against. A missing or empty entry is
            treated as "no keywords" instead of raising ``KeyError``.
        metadata: Dict with the string keys ``"year"``, ``"country"``,
            ``"continent"`` (comma-separated lists) and ``"target"``.

    Returns:
        ``True`` if the document should be kept, ``False`` otherwise.
    """
    keywords = doc.metadata.get("keywords") or ""

    def matches_any(terms):
        # Empty terms (trailing comma, missing field) are ignored: "" is a
        # substring of every string and would otherwise match every document.
        # A filter with no usable term imposes no constraint.
        wanted = [term for term in terms if term]
        return not wanted or any(term in keywords for term in wanted)

    years = metadata["year"].split(",")
    countries = metadata["country"].split(",")
    continents = metadata["continent"].split(",")

    # Countries and continents are considered pairwise; zip() stops at the
    # shorter list, so unpaired trailing entries are ignored (as before).
    locations = [term for pair in zip(countries, continents) for term in pair]

    filter_country = matches_any(locations)
    filter_year = matches_any(years)

    # Only "lab" and "clinic" restrict the target; documents tagged "all" pass
    # either of them. Any other target value disables the target filter.
    target = metadata["target"]
    if target in ("lab", "clinic"):
        filter_target = target in keywords or "all" in keywords
    else:
        filter_target = True

    return filter_year and filter_country and filter_target


### RAG tests runner (simple RAG, RAG with hard filters, RAG with LLM filters)

In [5]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.language_models.llms import LLM
from langchain.docstore.document import Document
from typing import Optional, List, Tuple


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: InMemoryVectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
    rag_type: str = "RAG_simple"
) -> Tuple[str, List[Document]]:
    
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    
    match rag_type:
        
        case "RAG_simple":
            relevant_docs = knowledge_index.similarity_search(query = question, k = num_retrieved_docs)

        case "RAG_filter_retriever":
            llm_metadata_output = llm.invoke(METADATA_EXTRACTOR.format(query = question)).content
            metadata = context_filter(llm_metadata_output)
            relevant_docs = knowledge_index.similarity_search(query = question, k = num_retrieved_docs, filter = lambda doc: filter_function(doc, metadata))     
        
        case _: 
            return
           
    relevant_docs = [f"{doc.metadata["keywords"]}\n{doc.page_content}" for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = SIMPLE_RAG_PROMPT_TEMPLATE.format(question = question, context = context)

    # Redact an answer
    answer = llm.invoke(final_prompt).content

    return answer, relevant_docs

In [11]:
import json
from time import sleep
from tqdm.auto import tqdm
from langchain_core.vectorstores import VectorStore
from datasets import Dataset
from typing import Optional
from ragatouille import RAGPretrainedModel


def run_rag_tests(
    eval_dataset: Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
    k = 4,
    k_final = 4,
    rag_type: str = "RAG_simple",
    sleep_seconds: float = 5,
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    The run is resumable: results already present in ``output_file`` are loaded
    first and their questions are skipped. The whole result list is rewritten
    to ``output_file`` after every answered question, so an interrupted run
    loses at most the example in flight.

    Args:
        eval_dataset: Iterable of examples with the keys ``"question"``,
            ``"answer"`` and ``"source_doc"``.
        llm: Model forwarded to ``answer_with_rag``.
        knowledge_index: Vector store forwarded to ``answer_with_rag``.
        output_file: Path of the JSON file holding the accumulated results.
        reranker: Optional reranker forwarded to ``answer_with_rag``.
        verbose: When true, print question, generated answer and true answer.
        test_settings: Optional label stored with every result.
        k: Number of documents to retrieve.
        k_final: Number of documents kept for the prompt.
        rag_type: Retrieval strategy forwarded to ``answer_with_rag``.
        sleep_seconds: Pause after each answered question (default 5, the
            previously hard-coded value).
    """
    # Load previous generations if they exist. Only "no file yet" and
    # "unparseable file" start from scratch; other errors propagate.
    try:
        with open(output_file, "r", encoding = "utf-8") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        outputs = []

    # Set of already-answered questions for O(1) resume checks.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm,
                                                knowledge_index,
                                                reranker = reranker,
                                                num_retrieved_docs = k,
                                                num_docs_final = k_final,
                                                rag_type = rag_type)

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # opened as UTF-8 rather than with the platform's default encoding.
        with open(output_file, "w", encoding = "utf-8") as f:
            json.dump(outputs, f, indent = 4, ensure_ascii = False)

        sleep(sleep_seconds)

### Single run debug

In [None]:
from langchain_openai import AzureOpenAIEmbeddings


# Registry of embedding models keyed by a short name. Only the Azure OpenAI
# entry is active; the HuggingFace alternatives are kept commented out.
# NOTE: the same dictionary is defined again in the "Define embeddings" cell.
# NOTE(review): in langchain-openai, `chunk_size` on the embeddings class is
# documented as the number of texts sent per embedding request, not the length
# of a text chunk - confirm that 384 is intended here.
all_embeddings = {     
    # "mpnet_base_v2": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"),
     
    # "minilm_l6": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2"),
    
    # "minilm_l12": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2"),

    # "multilingual": HuggingFaceEmbeddings(model_name = "intfloat/multilingual-e5-large"),
    
    "text_embedding_3_large": AzureOpenAIEmbeddings(
        azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
        api_key = os.environ["OPENAI_API_KEY"],
        model = "TextEmbedding3LargeDeployment",
        api_version = "2023-05-15",
        chunk_size = 384
    )
}

# Debug run: one question against the 384/100 fixed-size index, using the
# metadata-filtered retriever.
model_name = "text_embedding_3_large"
question = "In 2022, how much was the accessible market potential and the core target in Brazil for CAD-CAM technologies?"

# Load the persisted vector store that was built with the selected embeddings.
embeddings = all_embeddings[model_name]
vector_store_path = f"../indexing/models/Text+Images/{model_name}/384_100/384_100_{model_name}"
knowledge_index = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)

# Only the generated answer is of interest here; the retrieved texts are dropped.
answer, _ = answer_with_rag(
    question = question,
    llm = llm,
    knowledge_index = knowledge_index,
    num_retrieved_docs = 4,
    num_docs_final = 4,
    rag_type = "RAG_filter_retriever",
)
answer

### Define embeddings

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import AzureOpenAIEmbeddings


# Registry of embedding models keyed by a short name; `run_tests` looks models
# up here by `model_name`. Only the Azure OpenAI entry is active; the
# HuggingFace alternatives are kept commented out.
# NOTE(review): in langchain-openai, `chunk_size` on the embeddings class is
# documented as the number of texts sent per embedding request, not the length
# of a text chunk - confirm that 384 is intended here.
all_embeddings = {     
    # "mpnet_base_v2": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"),
     
    # "minilm_l6": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2"),
    
    # "minilm_l12": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2"),

    # "multilingual": HuggingFaceEmbeddings(model_name = "intfloat/multilingual-e5-large"),
    
    "text_embedding_3_large": AzureOpenAIEmbeddings(
        azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
        api_key = os.environ["OPENAI_API_KEY"],
        model = "TextEmbedding3LargeDeployment",
        api_version = "2023-05-15",
        chunk_size = 384
    )
}

### Generate answers with RAG

In [None]:
import os
import json


def _run_is_complete(output_file_name, eval_dataset):
    """Return True when the output file already holds an answer for every question."""
    try:
        with open(output_file_name, "r", encoding = "utf-8") as f:
            outputs = json.load(f)
        answered = {output["question"] for output in outputs}
    except (OSError, ValueError, KeyError, TypeError):
        # Missing, unreadable or malformed file: treat the run as not done.
        return False
    return all(example["question"] in answered for example in eval_dataset)


def run_tests(chunking_type, 
              semantic_chunking_type, 
              model_name, 
              chunk_size, 
              chunk_overlap, 
              eval_dataset,
              eval_dataset_name,
              generator_name,
              rag_type):
    """Run the RAG evaluation for one (chunking, embedding model) configuration.

    Results go to
    ``./output/<generator>/Text+Images/<rag_type>/<model>/rag_<settings>/<eval_dataset_name>``.
    A configuration is skipped only when that file already contains an answer
    for every question of ``eval_dataset``; a partially written file (e.g. from
    an interrupted run) is handed back to ``run_rag_tests``, which resumes it.

    Args:
        chunking_type: ``"page_chunking"``, ``"semantic"`` or anything else
            (treated as fixed-size chunking).
        semantic_chunking_type: Variant name, used only for ``"semantic"``.
        model_name: Key into ``all_embeddings``; also part of the index path.
        chunk_size: Chunk size, used only for fixed-size chunking.
        chunk_overlap: Chunk overlap, used only for fixed-size chunking.
        eval_dataset: Iterable of examples, each with a ``"question"`` key.
        eval_dataset_name: File name of the results file.
        generator_name: Label of the generator model, used in paths/settings.
        rag_type: Retrieval strategy forwarded to ``run_rag_tests``.
    """
    if chunking_type == "page_chunking":
        chunking = chunking_type
    elif chunking_type == "semantic":
        chunking = f"{chunking_type}_{semantic_chunking_type}"
    else:
        chunking = f"{chunk_size}_{chunk_overlap}"

    # NOTE(review): settings_name contains ":" and is used as a directory name,
    # which is not a valid path component on Windows - confirm target platforms.
    settings_name = f"chunk:{chunking}_embeddings:{model_name}_reader-model:{generator_name}"
    output_dir = f"./output/{generator_name}/Text+Images/{rag_type}/{model_name}/rag_{settings_name}"
    output_file_name = f"{output_dir}/{eval_dataset_name}"

    if _run_is_complete(output_file_name, eval_dataset):
        return

    # makedirs creates the intermediate directories too; os.mkdir raised
    # FileNotFoundError on a fresh checkout where only ./output existed.
    os.makedirs(output_dir, exist_ok = True)

    print("Running RAG...")
    print(f"Configuration: model: {model_name}, chunking: {chunking}")
    reranker = None
    vector_store_path = f"../indexing/models/Text+Images/{model_name}/{chunking}/{chunking}_{model_name}"
    embeddings = all_embeddings[model_name]
    vector_store = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)
    run_rag_tests(
        eval_dataset = eval_dataset,
        llm = llm,
        knowledge_index = vector_store,
        output_file = output_file_name,
        reranker = reranker,
        verbose = False,
        test_settings = settings_name,
        k = 4,
        k_final = 4,
        rag_type = rag_type
    )

    return


# Make sure the root results directory exists before any run writes into it.
if not os.path.exists("./output"):
    os.mkdir("./output")

# --- Experiment grid -------------------------------------------------------
RAG_type = "RAG_filter_retriever"
chunking_types = ["fixed_number"]
chunk_sizes = [256, 384, 512]
chunk_overlaps = [100]
semantic_chunking_types = ["percentile", "interquartile", "gradient"]
model_names = all_embeddings.keys()
eval_dataset_name = "all_QA_countries&years_brands.json"
GENERATOR_MODEL_NAME = "GPT_4o_mini"

# Evaluation questions (a list of examples loaded from JSON).
with open(f"../evaluation/dataset/{eval_dataset_name}", "r") as dataset_file:
    eval_dataset = json.load(dataset_file)

# One run per (embedding model, chunking configuration). Every chunking type is
# expanded into (semantic_chunking_type, chunk_size, chunk_overlap) triples so
# that a single call site covers the three cases.
for model_name in model_names:
    for chunking_type in chunking_types:

        if chunking_type == "fixed_number":
            # Every size/overlap combination, sizes in the outer loop.
            variants = [(None, size, overlap)
                        for size in chunk_sizes
                        for overlap in chunk_overlaps]
        elif chunking_type == "semantic":
            variants = [(kind, None, None) for kind in semantic_chunking_types]
        else:
            # E.g. page chunking: a single run without extra parameters.
            variants = [(None, None, None)]

        for semantic_kind, size, overlap in variants:
            run_tests(chunking_type = chunking_type,
                      semantic_chunking_type = semantic_kind,
                      model_name = model_name,
                      chunk_size = size,
                      chunk_overlap = overlap,
                      eval_dataset = eval_dataset,
                      eval_dataset_name = eval_dataset_name,
                      generator_name = GENERATOR_MODEL_NAME,
                      rag_type = RAG_type)