In [None]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
from huggingface_hub import notebook_login


import os

# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option("display.max_colwidth", None)

# Credentials must never be committed with the notebook: read the Hugging Face
# token from the HF_TOKEN environment variable (None when it is not set).
# SECURITY: the token that was previously hardcoded here has been exposed and
# should be revoked in the Hugging Face account settings.
HF_API_TOKEN = os.environ.get("HF_TOKEN")

# Interactive Hugging Face login for this notebook session.
notebook_login()

### Load chunks

In [93]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
import os


# Collect every regular file directly under ../sources (sub-directories are skipped).
# os.listdir() returns entries in arbitrary order, so the paths are sorted to keep
# the order of `docs` stable from one run to the next.
SOURCES_DIR = "../sources"
files = sorted(
    path
    for path in (os.path.join(SOURCES_DIR, fname) for fname in os.listdir(SOURCES_DIR))
    if os.path.isfile(path)
)

# Chunks of 2000 with an overlap of 200 between consecutive chunks;
# add_start_index asks the splitter to record where each chunk starts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    add_start_index=True,
)

# Load each file with PyMuPDF (image extraction disabled) and gather all chunks.
docs = []
for file in files:
    loader = PyMuPDFLoader(file, extract_images=False)
    docs.extend(loader.load_and_split(text_splitter=text_splitter))

### Set up agent for question generation

In [95]:
from huggingface_hub import InferenceClient


# Hosted model used by `llm_client` to generate and critique the QA couples.
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Client bound to that model, with a timeout of 500.
llm_client = InferenceClient(model=repo_id, timeout=500)


def call_llm(inference_client: InferenceClient, prompt: str):
    """Send `prompt` to the text-generation endpoint and return the generated text.

    Args:
        inference_client: Client whose ``post`` method performs the request.
        prompt: Prompt text, sent as the request's ``inputs``.

    Returns:
        The ``generated_text`` field of the first item of the decoded JSON reply.
    """
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 1000},
        "task": "text-generation",
    }
    raw_reply = inference_client.post(json=payload)
    # The reply is raw bytes holding a JSON list of generations.
    generations = json.loads(raw_reply.decode())
    return generations[0]["generated_text"]


# Smoke test: check that the endpoint answers before running the generation loop.
call_llm(llm_client, "This is a test context")

'This is a test context for the `@mui/material` library.\n\n## Installation\n\n```sh\nnpm install @mui/material\n```\n\n## Usage\n\n```jsx\nimport React from \'react\';\nimport { Button } from \'@mui/material\';\n\nfunction App() {\n  return (\n    <div className="App">\n      <Button variant="contained" color="primary">\n        Hello World\n      </Button>\n    </div>\n  );\n}\n\nexport default App;\n```\n\n## Documentation\n\n- [Material-UI](https://material-ui.com/)\n- [Material Design](https://material.io/)'

In [94]:
# Prompt template used to generate one factoid question/answer pair from a chunk.
# `{context}` is filled in with str.format(); the generation loop later splits the
# model output on the "Factoid question: " and "Answer: " markers requested here.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

### Generate questions

In [98]:
import random


N_GENERATIONS = 50
MAX_ANSWER_CHARS = 300  # answers of this length or longer are discarded

# random.sample() raises ValueError when asked for more items than exist, so cap
# the request at the number of available chunks.
n_samples = min(N_GENERATIONS, len(docs))
print(f"Generating {n_samples} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs, n_samples)):
    # Generate QA couple
    output_QA_couple = call_llm(llm_client, QA_generation_prompt.format(context=sampled_context.page_content))

    # The prompt template itself contains both markers, so always read the text
    # that follows the LAST occurrence of each one.
    question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
    answer = output_QA_couple.split("Answer: ")[-1]

    # Skip unusable generations explicitly rather than hiding every possible
    # error (including KeyboardInterrupt) behind a bare `except`.
    if len(answer) >= MAX_ANSWER_CHARS:
        continue  # answer is too long
    if "source" not in sampled_context.metadata:
        continue  # cannot attribute the QA couple to a source document

    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source_doc": sampled_context.metadata["source"],
        }
    )

# Make sure the target directory exists before writing the generated couples.
os.makedirs("dataset", exist_ok=True)
with open("dataset/all_QA.json", "w") as f:
    json.dump(outputs, f)

Generating 50 QA couples...


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Display the generated QA couples for a manual check.
outputs

In [None]:
# Reload the QA couples that the generation step saved to disk.
with open("dataset/all_QA.json", "r") as qa_file:
    outputs = json.load(qa_file)

### Set up prompts for question filtering

In [60]:
# Critique prompt #1 (groundedness): asks for a 1-5 rating of how unambiguously the
# question can be answered from the context. `{question}` and `{context}` are
# filled in with str.format(); the reply is parsed on "Evaluation: " / "Total rating: ".
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

# Critique prompt #2 (relevance): asks for a 1-5 rating of how useful the question
# is for the target use case named in the prompt. Only `{question}` is filled in.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be for users who want to perform market share analysis about brands which produce dental items.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

# Critique prompt #3 (standalone): asks for a 1-5 rating of how well the question
# can be understood without any surrounding context. Only `{question}` is filled in.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like IOS and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

### Generate LLM response for question evaluation

In [None]:
import os


# Number of QA couples to critique; set to None to critique all of them.
# The default of 2 keeps the quick-iteration behaviour this cell had before
# (only the first two couples are sent to the LLM).
CRITIQUE_LIMIT = 2

if not os.path.exists("dataset/unfiltered_dataset"):
    print("Generating critique for each QA couple...")
    for output in tqdm(outputs[:CRITIQUE_LIMIT]):
        # One LLM call per criterion; each reply is parsed below.
        evaluations = {
            "groundedness": call_llm(
                llm_client,
                question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
            ),
            "relevance": call_llm(
                llm_client,
                question_relevance_critique_prompt.format(question=output["question"]),
            ),
            "standalone": call_llm(
                llm_client,
                question_standalone_critique_prompt.format(question=output["question"]),
            ),
        }
        try:
            for criterion, evaluation in evaluations.items():
                # Expected shape: "... Evaluation: <rationale> Total rating: <n>".
                parts = evaluation.split("Total rating: ")
                score = int(parts[-1].strip())
                # `rationale` instead of `eval`, which shadowed the builtin.
                rationale = parts[-2].split("Evaluation: ")[1]
                output.update(
                    {
                        f"{criterion}_score": score,
                        f"{criterion}_eval": rationale,
                    }
                )
        except (ValueError, IndexError):
            # The reply did not follow the requested format (non-integer rating
            # or missing marker): leave the remaining criteria unscored.
            continue

    # Cache the critiques only after a full run, so that a partial debugging run
    # is never loaded later as if it were the complete dataset.
    if CRITIQUE_LIMIT is None:
        os.makedirs("dataset", exist_ok=True)
        with open("dataset/unfiltered_dataset", "w") as f:
            json.dump(outputs, f)

else:
    with open("dataset/unfiltered_dataset", "r") as f:
        outputs = json.load(f)
    display(pd.DataFrame(outputs).head(100))

In [None]:
# Number of rows once `outputs` is loaded into a DataFrame (one row per QA couple).
len(pd.DataFrame(outputs))

### Filter out bad questions

In [None]:
import pandas as pd
import os


# Show full text in displayed cells.
pd.set_option("display.max_colwidth", None)

# Columns shown in the before/after summary tables.
summary_columns = [
    "question",
    "answer",
    "groundedness_score",
    "relevance_score",
    "standalone_score",
]
eval_dataset_path = "dataset/eval_dataset"

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(generated_questions[summary_columns])

if os.path.exists(eval_dataset_path):
    # A filtered dataset was already saved: reuse it instead of filtering again.
    with open(eval_dataset_path, "r") as cached_file:
        generated_questions = pd.DataFrame.from_dict(json.load(cached_file))
else:
    # Keep rows whose three scores are all >= 1.
    # NOTE(review): the critique prompts ask for ratings between 1 and 5, so this
    # threshold only removes rows with a missing score (NaN compares as False) --
    # confirm that such a permissive threshold is intended.
    keep_mask = (
        (generated_questions["groundedness_score"] >= 1)
        & (generated_questions["relevance_score"] >= 1)
        & (generated_questions["standalone_score"] >= 1)
    )
    generated_questions = generated_questions.loc[keep_mask]

    # Persist the filtered questions as a column -> {index -> value} mapping.
    with open(eval_dataset_path, "w") as cached_file:
        json.dump(generated_questions.to_dict(), cached_file)

print("============================================")
print("Final evaluation dataset:")
display(generated_questions[summary_columns])

# Convert to a `datasets.Dataset`; the DataFrame index is not kept as a column.
eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)


### Set up RAG generator (LLM model)

In [None]:
# Prompt sent to the reader model by answer_with_rag(); `{context}` and `{question}`
# are filled in with str.format().
# NOTE(review): the <|system|>/<|user|>/<|assistant|> tags and </s> terminators are
# presumably the chat markup expected by the reader model -- confirm if it changes.
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [None]:
# from langchain_community.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEndpoint

# Reader model that writes the final RAG answers.
repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "zephyr-7b-beta"  # short label used in result file names

# Credentials must never be committed with the notebook: read the Hugging Face
# token from the HF_TOKEN environment variable (None when it is not set).
# SECURITY: the token that was previously hardcoded here has been exposed and
# should be revoked in the Hugging Face account settings.
HF_API_TOKEN = os.environ.get("HF_TOKEN")

READER_LLM = HuggingFaceEndpoint(
    repo_id=repo_id,
    task="text-generation",
    huggingfacehub_api_token=HF_API_TOKEN,
    max_new_tokens=512,
    top_k=30,
    temperature=0.1,
    repetition_penalty=1.03,
)

In [None]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM
from langchain.docstore.document import Document


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[Document]]:
    """Answer a question using RAG with the given knowledge index.

    Args:
        question: The user question.
        llm: Language model that writes the final answer.
        knowledge_index: Vector store queried with a similarity search.
        reranker: Optional reranker applied to the retrieved texts.
        num_retrieved_docs: Number of documents fetched from the vector store.
        num_docs_final: Number of documents kept for the prompt.

    Returns:
        A tuple ``(answer, relevant_docs)`` where ``relevant_docs`` is the list
        of document texts (strings) that were placed in the prompt.
    """
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. Each document is preceded by a "Document <i>:::"
    # header; the documents are concatenated without any extra separator.
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Write the answer. `invoke` is the supported entry point; calling the model
    # object directly (`llm(prompt)`) is deprecated in LangChain.
    answer = llm.invoke(final_prompt)

    return answer, relevant_docs

### Benchmark RAG pipeline

In [None]:
from langchain_core.language_models import BaseChatModel


def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
) -> None:
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in ``output_file`` are kept and their questions are
    skipped, so an interrupted run can be resumed. The file is rewritten after
    every newly answered question.

    Args:
        eval_dataset: Examples providing "question", "answer" and "source_doc".
        llm: Reader model passed to ``answer_with_rag``.
        knowledge_index: Vector store passed to ``answer_with_rag``.
        output_file: Path of the JSON file holding the list of results.
        reranker: Optional reranker passed to ``answer_with_rag``.
        verbose: When true, print each question with the generated and true answers.
        test_settings: Optional label stored with every result.
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or invalid results file: start from scratch.
        # (json.JSONDecodeError is a subclass of ValueError.)
        outputs = []

    # Questions answered so far; a set avoids rebuilding a list on every iteration.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index, reranker=reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after each example so that progress survives an interruption.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [None]:
# Grading prompt for the judge model. `{instruction}`, `{response}` and
# `{reference_answer}` are template variables; the doubled braces are literal
# braces in the rendered text. The judge is told to finish with
# "[RESULT] <score>", which evaluate_answers() relies on when splitting the reply.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


# Chat prompt for the judge: a fixed system message followed by the grading
# instructions rendered from EVALUATION_PROMPT.
_evaluation_messages = [
    SystemMessage(content="You are a fair evaluator language model."),
    HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
]
evaluation_prompt_template = ChatPromptTemplate.from_messages(_evaluation_messages)

In [None]:
# from prometheus_eval.litellm import LiteLLM
from langchain.chat_models import ChatLiteLLM


# model = LiteLLM('huggingface/prometheus-eval/prometheus-7b-v2.0')
# Label used by evaluate_answers() to name the score/feedback keys it writes.
evaluator_name = "prometheus-13b"
# Judge model, accessed through the LiteLLM chat wrapper.
eval_chat_model = ChatLiteLLM(model="huggingface/prometheus-eval/prometheus-13b-v2.0")

def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing.

    Args:
        answer_path: JSON file holding the list of results to grade.
        eval_chat_model: Chat model used as the judge (must provide ``invoke``).
        evaluator_name: Label used to name the score and feedback keys.
        evaluation_prompt_template: Prompt template taking ``instruction``,
            ``response`` and ``reference_answer``.

    Entries that already have a score for ``evaluator_name`` are skipped. A reply
    without the "[RESULT]" marker is reported and left unscored, so that it can be
    retried on a later run instead of aborting the whole evaluation.
    """
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        # Context manager so the file handle is closed deterministically.
        with open(answer_path, "r") as f:
            answers = json.load(f)

    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    for experiment in tqdm(answers):
        if score_key in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)

        # Split on the LAST "[RESULT]" marker: feedback before it, score after it.
        feedback, marker, score = eval_result.content.rpartition("[RESULT]")
        if not marker:
            print(f"No [RESULT] marker in evaluation of question {experiment['question']!r}; skipping.")
            continue
        experiment[score_key] = score.strip()
        experiment[feedback_key] = feedback.strip()

        # Checkpoint after each graded answer.
        with open(answer_path, "w") as f:
            json.dump(answers, f)

### Define text splitters

In [None]:
from langchain_text_splitters import TokenTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings


# Chunking variant whose vector store is loaded later in the notebook.
chunking_type = "page_chunking"

# Settings of the token-based splitter.
chunk_size = 256
chunk_overlap = 100

# Settings of the semantic splitter and the embedding model it relies on.
semantic_chunking_type = "gradient"
semantic_chunking_model = "sentence-transformers/all-MiniLM-L12-v2"
embeddings = HuggingFaceEmbeddings(model_name=semantic_chunking_model)

fixed_number_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
semantic_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type=semantic_chunking_type,
)

# Splitter for each chunking variant; page chunking has no splitter (None).
all_chunkings = {
    "page_chunking": None,
    "fixed_number": fixed_number_splitter,
    "semantic": semantic_splitter,
}

### Define embeddings

In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding back-ends available to the benchmark, keyed by the short name that is
# also used as a component of the vector-store path.
all_embeddings = {
    "llama3.2:1b": OllamaEmbeddings(model="llama3.2:1b"),
    "llama3.2:3b": OllamaEmbeddings(model="llama3.2:3b"),
    # This entry previously pointed at "llama3.2:1b" (copy-paste of the first
    # entry), so "gemma2b" silently produced Llama embeddings.
    # NOTE(review): confirm the exact Ollama tag ("gemma:2b" vs "gemma2:2b") and
    # that any stored "gemma2b" index was built with the same model.
    "gemma2b": OllamaEmbeddings(model="gemma:2b"),
    "mpnet_base_v2": HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
    "minilm_l6": HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    "minilm_l12": HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2"),
}

### Create vector store

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore


# Label of the chunking variant; it is one component of the vector-store path.
if chunking_type == "page_chunking":
    chunking = chunking_type
elif chunking_type == "semantic":
    chunking = f"{chunking_type}_{semantic_chunking_type}_minilm_l12"
else:
    chunking = f"{chunk_size}_{chunk_overlap}"

# Embedding model of the stored index; the same model is used to embed queries.
model_name = "mpnet_base_v2"
embeddings = all_embeddings[model_name]

# Load the persisted in-memory vector store for this model/chunking combination.
vector_store_path = f"../indexing/models/No OCR/{model_name}/{chunking}/{model_name}_cleaned"
vector_store = InMemoryVectorStore.load(path=vector_store_path, embedding=embeddings)

### Run benchmark

In [None]:
# Create the results directory if needed (no error when it already exists).
os.makedirs("./output", exist_ok=True)

# No reranker in this run; assign a reranker model here to enable reranking.
reranker = None
rerank = reranker is not None

# Describe the run with the labels that identify the loaded vector store.
# The previous version called `.replace()` on the embeddings *object* and read an
# undefined `rerank` variable, so this cell could not run on a fresh kernel.
settings_name = (
    f"chunk:{chunking}_embeddings:{model_name.replace('/', '~')}"
    f"_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
)
output_file_name = f"./output/rag_{settings_name}.json"

print(f"Running evaluation for {settings_name}:")

print("Running RAG...")
run_rag_tests(
    eval_dataset=eval_dataset,
    llm=READER_LLM,
    knowledge_index=vector_store,
    output_file=output_file_name,
    reranker=reranker,
    verbose=False,
    test_settings=settings_name,
)

print("Running evaluation...")
evaluate_answers(
    output_file_name,
    eval_chat_model,
    evaluator_name,
    evaluation_prompt_template,
)