### Set up the RAG generator

In [None]:
from langchain_openai import AzureChatOpenAI
import os
import getpass


# Ask for the API key only when it is not already available, so re-running this
# cell (or a Restart & Run All with the variable exported) does not block on an
# interactive prompt.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")

# `azure_endpoint` is the base URL of the Azure OpenAI resource. The deployment
# and the API version are passed through their own parameters below instead of
# being baked into the endpoint as a full request URL.
AZURE_OPENAI_ENDPOINT = "https://keystone1.openai.azure.com/"

# NOTE(review): the saved run of this notebook ended with a DeploymentNotFound
# (404) error — confirm that a deployment with exactly this name exists on the
# resource.
GENERATOR_DEPLOYMENT = "gpt-3.5-turbo"

# Chat model used to generate the RAG answers.
llm = AzureChatOpenAI(
    azure_endpoint = AZURE_OPENAI_ENDPOINT,
    api_key = os.environ["OPENAI_API_KEY"],
    api_version = "2024-08-01-preview",
    azure_deployment = GENERATOR_DEPLOYMENT,
)

# Prompt sent to the generator model. The `{question}` and `{context}`
# placeholders are filled in with str.format() by answer_with_rag.
RAG_PROMPT_TEMPLATE = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum, keep the answer concise and DO NOT mention from which documents you take information.
    Question: {question}\n 
    Context: {context}\n 
    Answer: 
"""

In [12]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.language_models.llms import LLM
from langchain.docstore.document import Document
from typing import Optional, List, Tuple


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: InMemoryVectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Answer a question using RAG with the given knowledge index.

    Args:
        question: The question to answer.
        llm: Model used to write the answer. Its ``invoke(prompt)`` result must
            expose a ``.content`` attribute, which is returned as the answer.
        knowledge_index: Vector store queried with
            ``similarity_search(query=..., k=...)``.
        reranker: Optional reranker; when given, the retrieved texts are
            reordered with ``reranker.rerank(question, texts, k=num_docs_final)``.
        num_retrieved_docs: Number of documents requested from the vector store.
        num_docs_final: Maximum number of documents placed in the prompt.

    Returns:
        A tuple ``(answer, relevant_docs)`` where ``relevant_docs`` is the list
        of document texts (plain strings, not ``Document`` objects) that were
        inserted into the prompt.
    """
    # Gather documents with the retriever and keep only their text
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]

    # Optionally rerank results (nothing to rerank when retrieval came back empty)
    if reranker is not None and relevant_docs:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. Every document ends with a newline so that its text
    # does not run straight into the next "Document i:::" header.
    context = "\nExtracted documents:\n"
    context += "".join(
        f"Document {i}:::\n{doc}\n" for i, doc in enumerate(relevant_docs)
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question = question, context = context)

    # Generate the answer
    answer = llm.invoke(final_prompt).content

    return answer, relevant_docs

In [13]:
import json
from tqdm.auto import tqdm
from langchain_core.language_models import BaseChatModel
from langchain_core.vectorstores import VectorStore
from datasets import Dataset


def run_rag_tests(
    eval_dataset: Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
    k: int = 4,
    k_final: int = 4
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in ``output_file`` are loaded first and their
    questions are skipped, so an interrupted run can be resumed. The file is
    rewritten after every newly answered example.

    Args:
        eval_dataset: Iterable of examples providing the keys ``"question"``,
            ``"answer"`` and ``"source_doc"``.
        llm: Model forwarded to ``answer_with_rag``.
        knowledge_index: Vector store forwarded to ``answer_with_rag``.
        output_file: Path of the JSON file holding the list of results.
        reranker: Optional reranker forwarded to ``answer_with_rag``.
        verbose: When truthy, print the question, the generated answer and the
            reference answer for every new example.
        test_settings: Optional label stored in each result under
            ``"test_settings"``.
        k: Number of documents to retrieve (``num_retrieved_docs``).
        k_final: Number of documents kept for the prompt (``num_docs_final``).
    """
    # Load previous generations if they exist. Only a missing or unparsable file
    # means "start from scratch"; any other error is surfaced instead of hidden.
    try:
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        outputs = []

    # Set of questions already answered, kept in sync with `outputs` so the
    # membership test does not rebuild a list for every example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm,
                                                knowledge_index,
                                                reranker = reranker,
                                                num_retrieved_docs = k,
                                                num_docs_final = k_final)

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example so progress survives a crash.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

### Define embeddings

In [14]:
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import AzureOpenAIEmbeddings


# Embedding models under test, keyed by the name that run_tests uses both to
# look up the model and to build the vector-store path.
all_embeddings = {
    "llama3.2:1b": OllamaEmbeddings(model = "llama3.2:1b"),

    "llama3.2:3b": OllamaEmbeddings(model = "llama3.2:3b"),

    # This entry used to point at "llama3.2:1b" (copy-paste of the first entry),
    # so the "gemma2b" configuration silently embedded queries with Llama.
    # NOTE(review): "gemma:2b" is assumed to be the intended Ollama tag — confirm
    # it is the model the "gemma2b" index was built with, or rebuild that index.
    "gemma2b": OllamaEmbeddings(model = "gemma:2b"),

    "mpnet_base_v2": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"),

    "minilm_l6": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2"),

    "minilm_l12": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2"),

    "multilingual": HuggingFaceEmbeddings(model_name = "intfloat/multilingual-e5-large"),

    # `azure_endpoint` is the resource base URL; the deployment that used to be
    # embedded in the endpoint URL is passed explicitly instead.
    "openai": AzureOpenAIEmbeddings(
        azure_endpoint="https://keystone1.openai.azure.com/",
        azure_deployment="text-embedding-3-large-2",
        api_key=os.environ["OPENAI_API_KEY"],
        model="TextEmbedding3LargeDeployment",
        api_version="2023-05-15",
    ),
}

### Generate answers with RAG

In [15]:
import os


def _answered_questions(output_file):
    """Return the set of questions already stored in `output_file` (empty if missing or unparsable)."""
    try:
        with open(output_file, "r") as f:
            return {entry["question"] for entry in json.load(f)}
    except (FileNotFoundError, json.JSONDecodeError):
        return set()


def run_tests(chunking_type, 
              semantic_chunking_type, 
              model_name, chunk_size, 
              chunk_overlap, 
              eval_dataset, 
              generator_name):
    """Generate RAG answers for one (chunking, embedding model) configuration.

    Results are written to
    ``./output/{generator_name}/rag_{settings}/dataset.json``. A configuration
    whose output file already contains every question of ``eval_dataset`` is
    skipped; a partially written file is resumed by ``run_rag_tests``.

    Args:
        chunking_type: ``"page_chunking"``, ``"semantic"`` or any other value
            (treated as fixed-size chunking).
        semantic_chunking_type: Variant name, only used for ``"semantic"``.
        model_name: Key into the global ``all_embeddings`` dict; also part of
            the vector-store path.
        chunk_size: Chunk size, only used for fixed-size chunking.
        chunk_overlap: Chunk overlap, only used for fixed-size chunking.
        eval_dataset: Examples forwarded to ``run_rag_tests``.
        generator_name: Label used in the output path and settings name. The
            model actually called is the global ``llm``.
    """
    # NOTE(review): hard-coded filter — every configuration other than
    # page_chunking + multilingual is skipped, which makes the "semantic" and
    # fixed-size branches below unreachable for now.
    if chunking_type != "page_chunking" or model_name != "multilingual":
        return

    if chunking_type == "page_chunking":
        chunking = chunking_type
    elif chunking_type == "semantic":
        chunking = f"{chunking_type}_{semantic_chunking_type}"
    else:
        chunking = f"{chunk_size}_{chunk_overlap}"

    settings_name = f"chunk:{chunking}_embeddings:{model_name}_reader-model:{generator_name}"
    output_dir = f"./output/{generator_name}/rag_{settings_name}"
    output_file_name = f"{output_dir}/dataset.json"

    # Skip only when every question has been answered; a partial file left by
    # an interrupted run is picked up again by run_rag_tests.
    answered = _answered_questions(output_file_name)
    if all(example["question"] in answered for example in eval_dataset):
        return

    # makedirs creates the missing ./output/{generator_name} parent as well and
    # tolerates a directory left behind by a previous failed run.
    os.makedirs(output_dir, exist_ok=True)

    print("Running RAG...")
    print(f"Configuration: model: {model_name}, chunking: {chunking}")
    reranker = None
    vector_store_path = f"../indexing/models/No OCR/{model_name}/{chunking}/{model_name}"
    embeddings = all_embeddings[model_name]
    vector_store = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)
    run_rag_tests(
        eval_dataset = eval_dataset,
        llm = llm,
        knowledge_index = vector_store,
        output_file = output_file_name,
        reranker = reranker,
        verbose = False,
        test_settings = settings_name,
        k = 4,
        k_final = 4
    )

    return


# Root directory for all generated answers.
os.makedirs("./output", exist_ok=True)

chunking_types = ["page_chunking", "fixed_number", "semantic"]
chunk_sizes = [384, 256]
chunk_overlaps = [0, 20, 50, 100]
semantic_chunking_types = ["percentile", "standard_deviation", "interquartile", "gradient"]
model_names = all_embeddings.keys()
GENERATOR_MODEL_NAME = "GPT_3.5_turbo"

with open("dataset/all_QA.json", "r", encoding="utf-8") as f:
    eval_dataset = json.load(f)


def _chunking_variants(chunking_type):
    """Yield (semantic_chunking_type, chunk_size, chunk_overlap) for every variant of a chunking type."""
    if chunking_type == "fixed_number":
        # Every size/overlap combination, sizes in the outer loop.
        for chunk_size in chunk_sizes:
            for chunk_overlap in chunk_overlaps:
                yield None, chunk_size, chunk_overlap
    elif chunking_type == "semantic":
        for semantic_chunking_type in semantic_chunking_types:
            yield semantic_chunking_type, None, None
    else:
        # Chunking types without parameters (e.g. page_chunking) have one variant.
        yield None, None, None


# Run every embedding model against every chunking configuration.
for model_name in model_names:
    for chunking_type in chunking_types:
        for semantic_chunking_type, chunk_size, chunk_overlap in _chunking_variants(chunking_type):
            run_tests(chunking_type = chunking_type,
                      semantic_chunking_type = semantic_chunking_type,
                      model_name = model_name,
                      chunk_size = chunk_size,
                      chunk_overlap = chunk_overlap,
                      eval_dataset = eval_dataset,
                      generator_name = GENERATOR_MODEL_NAME)

Running RAG...
Configuration: model: multilingual, chunking: page_chunking


  0%|          | 0/28 [00:00<?, ?it/s]

NotFoundError: Error code: 404 - {'error': {'code': 'DeploymentNotFound', 'message': 'The API deployment for this resource does not exist. If you created the deployment within the last 5 minutes, please wait a moment and try again.'}}

### Evaluate RAG with RAGAS

In [31]:
from datasets import Dataset
from ragas import evaluate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from ragas.metrics import context_precision, answer_relevancy, faithfulness, context_recall, answer_correctness
from ragas.run_config import RunConfig
import os
import json


# Judge model for RAGAS. It has its own name so that it does not replace the
# global generator `llm` that run_tests reads when it is called.
# `azure_endpoint` is the resource base URL; deployment and API version are
# passed through their own parameters.
evaluator_llm = AzureChatOpenAI(
    azure_endpoint = "https://keystone1.openai.azure.com/",
    api_key = os.environ["OPENAI_API_KEY"],
    api_version = "2024-08-01-preview",
    azure_deployment = "gpt-4o",
)

# Embeddings used by the RAGAS metrics. The deployment that used to be embedded
# in the endpoint URL is passed explicitly.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://keystone1.openai.azure.com/",
    azure_deployment="text-embedding-3-large-2",
    api_key=os.environ["OPENAI_API_KEY"],
    model="TextEmbedding3LargeDeployment",
    api_version="2023-05-15",
)

# Configuration to evaluate; these values select the output folder below.
chunking_type = "page_chunking"
semantic_chunking_type = "percentile"
chunk_size = 384
chunk_overlap = 20

# Same naming scheme as run_tests uses for its output folders.
if chunking_type == "page_chunking":
    chunking = chunking_type
elif chunking_type == "semantic":
    chunking = f"{chunking_type}_{semantic_chunking_type}"
else:
    chunking = f"{chunk_size}_{chunk_overlap}"

model_name = "minilm_l6"
GENERATOR_MODEL_NAME = "GPT_4o_mini"

settings_name = f"rag_chunk:{chunking}_embeddings:{model_name}_reader-model:{GENERATOR_MODEL_NAME}"
folder = f"output/{GENERATOR_MODEL_NAME}/{settings_name}"

with open(f"{folder}/dataset.json", "r") as f:
    dataset = json.load(f)

# Column-oriented view of the generated answers, using the column names the
# original cell passed to RAGAS.
ragas_columns = {
    "question": [entry["question"] for entry in dataset],
    "contexts": [entry["retrieved_docs"] for entry in dataset],
    "answer": [entry["generated_answer"] for entry in dataset],
    "ground_truth": [entry["true_answer"] for entry in dataset],
}

eval_dataset = Dataset.from_dict(ragas_columns)

# The evaluation is skipped when its result file already exists.
if not os.path.exists(f"{folder}/results_AC.json"):
    run_config = RunConfig(timeout = 6000, max_retries = 20, max_wait = 50, log_tenacity = False)
    results = evaluate(dataset = eval_dataset,
                       metrics = [context_precision, faithfulness, answer_relevancy, context_recall, answer_correctness],
                       llm = evaluator_llm,
                       embeddings = embeddings,
                       run_config = run_config)

    results.to_pandas().to_json(f"{folder}/results_AC.json")


### RAGAS results processing

In [None]:
from datasets import Dataset
from statistics import mean, variance


path = f"{folder}/results.json"
path_ac = f"{folder}/results_AC.json"
dataset = Dataset.from_json(path)
dataset_ac = Dataset.from_json(path_ac)
metrics = {}


def _summarize(scores):
    """Return the mean and variance of a mapping of scores, counting missing (None) scores as 0.0."""
    values = [0.0 if scores[entry] is None else scores[entry] for entry in scores]
    return {"mean": mean(values), "variance": variance(values)}


for metric in ["context_precision", "faithfulness", "answer_relevancy", "context_recall"]:
    # Each metric column is read from its first row, like answer_correctness
    # below — presumably the file holds a single row whose cells map sample
    # index -> score; verify against the file written by the evaluation cell.
    metrics[metric] = _summarize(dataset[metric][0])

    # Inner quotes are single so the f-string also parses before Python 3.12.
    print(f"{metric}:\n{metrics[metric]['mean']:.3f}, {metrics[metric]['variance']:.3f}\n")

ac = _summarize(dataset_ac["answer_correctness"][0])
print(f"answer_correctness:\n{ac['mean']:.3f}, {ac['variance']:.3f}")


In [None]:
from datasets import Dataset


# Path relative to the notebook's working directory, like the other paths in
# this notebook, instead of an absolute path into one user's home directory.
# NOTE(review): assumes the notebook is run from the `evaluation/` directory
# that the previous absolute path pointed into — confirm.
path = "output/rag_chunk:semantic_interquartile_embeddings:mpnet_base_v2_reader-model:Zephyr_7B/dataset.json"
dataset = Dataset.from_json(path)

# Show the first 50 characters of every generated answer.
for generated_answer in dataset["generated_answer"]:
    print(generated_answer[:50])


### Inspect post-evaluation

In [None]:
from datasets import Dataset


path = f"{folder}/results.json"
path_ac = f"{folder}/results_AC.json"
dataset = Dataset.from_json(path)
dataset_ac = Dataset.from_json(path_ac)

print(path)
# for d in dataset["user_input"]:
#     for num, i in enumerate(d):
#         print(f"{num+1}: {d[i]}\n")

# Index (as a string key) of the sample to inspect.
key = "1"


def _format_score(value):
    """Format a metric value with three decimals, or "n/a" when the score is missing."""
    return "n/a" if value is None else f"{value:.3f}"


# Plain indexing instead of nested same-quote f-strings, which only parse on
# Python 3.12+.
print(dataset["user_input"][0][key])
# print(dataset["reference"][0][key])
print(dataset["response"][0][key])
for metric in ("context_precision", "faithfulness", "answer_relevancy", "context_recall"):
    print(_format_score(dataset[metric][0][key]))
print(_format_score(dataset_ac["answer_correctness"][0][key]))

### Rank the configurations

In [None]:
import os
import json


base_path = "output/GPT_4o_mini"

# Load the RAGAS results of every configuration folder. sorted() makes the
# order of `files` (and therefore files[0]) reproducible, since os.listdir
# returns entries in arbitrary order.
files = []
for conf in sorted(os.listdir(base_path)):
    results_path = f"{base_path}/{conf}/results.json"
    if not os.path.isfile(results_path):
        # Stray files and configurations that were not evaluated yet.
        print(f"Skipping {conf}: no results.json")
        continue
    with open(results_path, "r") as f:
        files.append({"conf": conf, "file": json.load(f)})

# NOTE(review): the ranking itself is not implemented yet — `points` stays
# empty and `values` only holds the last metric once the loop has finished.
points = {}
metrics = ["context_precision", "faithfulness", "answer_relevancy", "context_recall"]
for metric in metrics:
    values = [file["file"][metric] for file in files]


In [None]:
# Display the context_precision entry of the first loaded configuration.
files[0]["file"]["context_precision"]