# Install pre-requisites

In [None]:
!pip install -q torch transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets ragatouille ratelimit retry duckdb

# Model preparations

To go through the evaluation process, we need the following models:

1. Document model: An embedding model that generates document embeddings, which will be persisted in a vector index.
2. Reader model: A text completion model to answer the final question with augmented context.
3. Evaluator model: A chat completion model that gives the final verdict on the RAG output. As this model affects scoring considerably, a stronger model should be used.

As the choice of models is not the subject of this article and won't impact the comparison between RAG frameworks, we have chosen a completely local setup for this experiment, for better speed and lower cost.

To be more precise, the following models, which are already optimized in Ollama, are used:

* [all-minilm](https://ollama.com/library/all-minilm) for `Embedding model`;
* [Mixtral-8x7B](https://ollama.com/library/mixtral) for `Evaluator model` and `Reader model`
   

In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama, MiniMaxChat
from langchain_openai import ChatOpenAI, OpenAI
import os

# Base URL of the server hosting the Mixtral model.
# NOTE(review): originally described as a vLLM server, but it is accessed below through
# the Ollama clients (`Ollama`, `ChatOllama`) — confirm the endpoint speaks the Ollama API.
MIXTRAL_ENDPOINT = "http://192.168.0.134:30253"

# Base URL of the Ollama server hosting the embedding model.
MINILM_ENDPOINT = "http://192.168.0.29:11434"

# Model names passed to the clients below; the reader and the evaluator use the same model.
READER_MODEL_NAME = "mixtral:instruct"
EMBEDDING_NAME = "all-minilm"
EVALUATOR_NAME = "mixtral:instruct"

# Client objects shared by the rest of the notebook.
EMBEDDING_MODEL = OllamaEmbeddings(model=EMBEDDING_NAME, base_url = MINILM_ENDPOINT)
READER_LLM = Ollama(model=READER_MODEL_NAME, base_url = MIXTRAL_ENDPOINT)
EVAL_MODEL = ChatOllama(model=EVALUATOR_NAME, base_url = MIXTRAL_ENDPOINT)

# Directory under which the langchain FAISS indexes are stored.
LANGCHAIN_DATA_ROOT = "./data/langchain"
# NOTE(review): not referenced by any code visible in this notebook.
INSTINCT_DOC_AGENT_DATA_ROOT = "./data/doc_agent"



In [None]:
# Test all these models by sending a trivial input to each of them.
# Only the value of the last expression (the evaluator reply) is rendered as the cell output;
# the first two calls just have to complete without raising.

EMBEDDING_MODEL.embed_query("hello")

READER_LLM.invoke("hello")

EVAL_MODEL.invoke("hello")


# Build RAG pipeline using `langchain` 

1. Transform the training data in `m-ric/huggingface_doc` into `langchain` document objects
2. Load them into a FAISS index if the index file is absent
3. Prompt `READER_LLM` with the evaluation data in `m-ric/huggingface_doc_qa_eval`

## Knowledge base preparations

In [None]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [None]:
# Prompt template asking a model to write one factoid question/answer pair from a passage.
# Placeholder: {context}.
# NOTE(review): not referenced by any code visible in this notebook.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [None]:
# Prompt template rating (1-5) how unambiguously a question can be answered from a context.
# Placeholders: {question}, {context}.
# NOTE(review): not referenced by any code visible in this notebook.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

# Prompt template rating (1-5) how useful a question is to developers building NLP
# applications with the Hugging Face ecosystem. Placeholder: {question}.
# NOTE(review): not referenced by any code visible in this notebook.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

# Prompt template rating (1-5) how well a question can be understood without its source context.
# Placeholder: {question}.
# NOTE(review): not referenced by any code visible in this notebook.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [None]:
# Evaluation set; the test runners below read the `question`, `answer` and `source_doc` fields of each row.
EVAL_DATASET = datasets.load_dataset("m-ric/huggingface_doc_qa_eval", split="train")


In [None]:
from langchain.docstore.document import Document as LangchainDocument

def _as_langchain_document(row) -> LangchainDocument:
    """Wrap one dataset row: `text` becomes the page content, `source` is kept as metadata."""
    return LangchainDocument(page_content=row["text"], metadata={"source": row["source"]})


# The whole Hugging Face documentation dataset, one LangChain document per row.
RAW_KNOWLEDGE_BASE = list(
    map(
        _as_langchain_document,
        tqdm(datasets.load_dataset("m-ric/huggingface_doc", split="train")),
    )
)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument]
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the de-duplicated chunks.

    Chunk length is measured in tokens of the tiktoken encoding for "gpt-4"
    (the splitter is built with `from_tiktoken_encoder`), not in characters.

    Args:
        chunk_size: maximum chunk length, in tokens
        knowledge_base: documents to split

    Returns:
        The chunks in their original order. When several chunks have exactly the
        same text, only the first one is kept.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4",
        chunk_size=chunk_size,
        # chunk_overlap=int(chunk_size / 10),
        chunk_overlap=0,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
        disallowed_special=[],
        allowed_special="all"
    )

    # Remove duplicates while preserving order; a set is enough to remember seen texts.
    seen_texts = set()
    unique_chunks = []
    for chunk in text_splitter.split_documents(knowledge_base):
        if chunk.page_content in seen_texts:
            continue
        seen_texts.add(chunk.page_content)
        unique_chunks.append(chunk)

    return unique_chunks

In [None]:
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
import os
from langchain_core.embeddings import Embeddings


def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model: Embeddings,
    embedding_model_name: str
) -> FAISS:
    """
    Return the FAISS index for the given chunk size and embedding model.

    The index is cached on disk under LANGCHAIN_DATA_ROOT, in a folder named after
    the chunk size and the embedding model name. If that folder exists the index is
    loaded from it; otherwise the documents are split, embedded, indexed and saved.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model: the embedding
        embedding_model_name: name of the embedding model to use

    Returns:
        FAISS index
    """
    index_dir = os.path.join(
        LANGCHAIN_DATA_ROOT,
        f"index_chunk:{chunk_size}_embeddings:{embedding_model_name}",
    )

    # Fast path: reuse the index persisted by a previous run.
    if os.path.isdir(index_dir):
        return FAISS.load_local(
            index_dir,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
        )

    # Slow path: build the index from scratch, then persist it for next time.
    chunks = split_documents(chunk_size, langchain_docs)
    print(f"Index not found, generating it... {len(chunks)} docs in total")
    built_index = FAISS.from_documents(
        chunks, embedding_model, distance_strategy=DistanceStrategy.COSINE
    )
    built_index.save_local(index_dir)
    return built_index

## QA chain

In [None]:
# Prompt sent to the reader model by `answer_with_rag`.
# Placeholders: {context} (the retrieved documents) and {question}.
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [None]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """
    Answer a question using RAG with the given knowledge index.

    Args:
        question: the question to answer
        llm: reader model used to generate the answer
        knowledge_index: vector store queried for the most similar chunks
        reranker: optional reranker applied to the retrieved chunks
        num_retrieved_docs: number of chunks fetched from the vector store
        num_docs_final: number of chunks kept in the prompt

    Returns:
        A tuple of the generated answer and the texts of the chunks placed in the prompt.
    """
    # Gather documents with retriever
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]  # keep only the text

    # Optionally rerank results
    if reranker:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. Documents are separated by a newline so that the end of
    # one document does not run into the "Document N:::" header of the next one.
    context = "\nExtracted documents:\n"
    context += "\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs))

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    print("final prompt size:", len(final_prompt))

    # Generate the answer
    answer = llm.invoke(final_prompt)

    return answer, relevant_docs

# Generating answers

## Test function with langchain

In [None]:
from langchain_core.language_models import BaseChatModel 

def run_langchain_rag_tests(
    eval_dataset: datasets.Dataset,
    llm: LLM,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """
    Runs RAG tests on the given dataset and saves the results to the given output file.

    The output file doubles as a checkpoint: results already present in it are kept,
    questions already answered are skipped, and the file is rewritten after every new
    answer so an interrupted run can be resumed.

    Args:
        eval_dataset: rows providing `question`, `answer` and `source_doc`
        llm: reader model used to generate answers
        knowledge_index: vector store used for retrieval
        output_file: path of the JSON file holding the list of results
        reranker: optional reranker forwarded to `answer_with_rag`
        verbose: when true, print each question with the generated and the true answer
        test_settings: optional label stored with every result
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # No usable checkpoint (missing or unreadable JSON): start from scratch.
        # Other errors, and KeyboardInterrupt, are no longer swallowed.
        outputs = []

    # Built once instead of rebuilding a list of questions for every example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]

        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index, reranker=reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every answer.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [None]:
def run_langchain_test_all() -> str:
    """
    Build index and run langchain test with fixed parameter and model selections.

    Uses a chunk size of 200 and no reranking, with the reader and embedding models
    configured at the top of the notebook.

    :return: path of the JSON file the results were written to
    """
    # Creates the directory if needed; no error when it already exists.
    os.makedirs("./output", exist_ok=True)

    chunk_size = 200
    rerank = False

    settings_name = f"langchain_chunk:{chunk_size}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}_embedding-model:{EMBEDDING_NAME}"
    output_file_name = os.path.join("./output", f"{settings_name}.json")

    print("Loading knowledge base embeddings...")
    knowledge_index = load_embeddings(
        RAW_KNOWLEDGE_BASE,
        chunk_size=chunk_size,
        embedding_model=EMBEDDING_MODEL,
        embedding_model_name=EMBEDDING_NAME
    )

    print(f"Running RAG with {settings_name}")
    # The reranker model is only loaded when reranking is enabled.
    reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") if rerank else None
    run_langchain_rag_tests(
        eval_dataset=EVAL_DATASET,
        llm=READER_LLM,
        knowledge_index=knowledge_index,
        output_file=output_file_name,
        reranker=reranker,
        verbose=True,
        test_settings=settings_name,
    )

    return output_file_name

In [None]:
# Execute the langchain test; the variable holds the path of the JSON results file.
LANGCHAIN_TEST_OUTPUT = run_langchain_test_all()

## Test function with doc-agent in instinct.cpp

You have to manually start `doc-agent` locally.

To build the knowledge index with the same knowledge base data from HF:

```shell
$DOC_AGENT_BIN --verbose \
  --parent_child_retriever \
  --child_chunk_size=200 \
  --chat_model_model_name=gemma:2b \
  --embedding_model_model_name=all-minilm:latest \
  --db_path=./data/instinct/index.db \
  --vector_table_dimension=384 \
  build \
  --force \
  --file=https://huggingface.co/api/datasets/m-ric/huggingface_doc/parquet/default/train/0.parquet \
  --type=PARQUET \
  --parquet_mapping=0:txt,1:metadata:source:varchar
```

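To start the HTTP server for queries (note that `--db_path` must point to the database produced by the build step; the two commands shown here use different paths):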

```shell
$DOC_AGENT_BIN --verbose \
  --parent_child_retriever \
  --child_chunk_size=200 \
  --chat_model_model_name=gemma:2b \
  --embedding_model_model_name=all-minilm:latest \
  --db_path=/tmp/rag_eval.db \
  --vector_table_dimension=384 \
  serve \
  --port=9090 
```

Next, we will begin QA tests.

In [None]:
def answer_with_doc_agent(
    question: str,
    endpoint: str = "http://localhost:9090/v1/chat/completions",
    timeout: Optional[float] = None,
):
    """
    Ask the locally running doc-agent server to answer a question.

    Args:
        question: the question, sent as a single message with role "human"
        endpoint: URL of the chat-completions endpoint of the doc-agent server
        timeout: request timeout in seconds; None (the default) waits indefinitely

    Returns:
        The `content` of the message of the first choice in the JSON response.

    Raises:
        RuntimeError: if the server does not answer with HTTP 200.
    """
    import requests
    res = requests.post(
        endpoint,
        json={"messages": [{"content": question, "role": "human"}], "stream": False},
        timeout=timeout,
    )
    # Explicit check instead of `assert`: it is not stripped under `python -O`
    # and reports what the server actually returned.
    if res.status_code != 200:
        raise RuntimeError(f"doc-agent returned HTTP {res.status_code}: {res.text[:500]}")
    body = res.json()
    return body["choices"][0]["message"]["content"]
    

def run_doc_agent_rag_tests(
    eval_dataset: datasets.Dataset,
    output_file: str,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """
    Runs RAG tests on the given dataset and saves the results to the given output file.

    The output file doubles as a checkpoint: results already present in it are kept,
    questions already answered are skipped, and the file is rewritten after every new
    answer so an interrupted run can be resumed.

    Args:
        eval_dataset: rows providing `question`, `answer` and `source_doc`
        output_file: path of the JSON file holding the list of results
        verbose: when true, print each question with the generated and the true answer
        test_settings: optional label stored with every result
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # No usable checkpoint (missing or unreadable JSON): start from scratch.
        # Other errors, and KeyboardInterrupt, are no longer swallowed.
        outputs = []

    # Built once instead of rebuilding a list of questions for every example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer = answer_with_doc_agent(question)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every answer.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [None]:
def run_doc_agent_test_all():
    """
    Run the doc-agent test against the evaluation dataset with fixed settings.

    `chunk_size` and `rerank` are only used to label the results; the doc-agent
    server itself is configured when it is started.

    :return: path of the JSON file the results were written to
    """
    # Creates the directory if needed; no error when it already exists.
    os.makedirs("./output", exist_ok=True)

    chunk_size = 200
    rerank = False

    settings_name = f"doc_agent_chunk:{chunk_size}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}_embedding-model:{EMBEDDING_NAME}"
    output_file_name = f"./output/rag_{settings_name}.json"

    print(f"Running RAG with settings {settings_name}")
    run_doc_agent_rag_tests(
        eval_dataset=EVAL_DATASET,
        output_file=output_file_name,
        test_settings=settings_name
    )

    return output_file_name

In [None]:
# Execute the doc-agent test; the variable holds the path of the JSON results file.
DOC_AGENT_TEST_OUTPUT = run_doc_agent_test_all()

# Evaluation Runner

In [None]:
from ratelimit import limits,sleep_and_retry
from retry import retry
from langchain_core.prompts import ChatPromptTemplate

@sleep_and_retry
@limits(calls=6, period=60)
def throttled_invoke(eval_chat_model, eval_prompt):
    """
    Invoke the evaluator model under a rate limit of 6 calls per 60 seconds.

    When the limit is exceeded, `sleep_and_retry` waits for the current period
    to elapse and then retries the call instead of failing.
    """
    return eval_chat_model.invoke(eval_prompt)



@retry(exceptions=Exception, tries=6)
def evaluate_single_answer(
    evaluation_prompt_template: ChatPromptTemplate,
    experiment: dict,
    throttled: bool,
    eval_chat_model: BaseChatModel
):
    """
    Ask the evaluator model to grade one generated answer.

    Any exception raised here (including an unparsable or out-of-range score)
    makes the `retry` decorator call the function again, up to 6 attempts in total.

    Args:
        evaluation_prompt_template: template formatted with `instruction`, `response`
            and `reference_answer`
        experiment: result record providing `question`, `generated_answer` and `true_answer`
        throttled: when true, go through the rate-limited `throttled_invoke`
        eval_chat_model: chat model used as evaluator

    Returns:
        A two-element list `[feedback, score]`, where `score` is the text following
        "[RESULT]" and has been checked to be an integer between 1 and 5.

    Raises:
        Exception: if the reply does not contain exactly one "[RESULT]" marker.
        ValueError: if the score is not an integer between 1 and 5.
    """
    eval_prompt = evaluation_prompt_template.format_messages(
        instruction=experiment["question"],
        response=experiment["generated_answer"],
        reference_answer=experiment["true_answer"],
    )
    if throttled:
        eval_result = throttled_invoke(eval_chat_model, eval_prompt)
    else:
        eval_result = eval_chat_model.invoke(eval_prompt)
    splits = [item.strip() for item in eval_result.content.split("[RESULT]")]
    if len(splits) != 2:
        print(splits)
        raise Exception("Evaluation did not complete successfully")
    # Explicit validation instead of `assert`, which is stripped under `python -O`.
    # int() itself raises ValueError when the score is not an integer.
    score = int(splits[1])
    if not 1 <= score <= 5:
        raise ValueError(f"Evaluation score out of range 1-5: {splits[1]!r}")
    return splits


def evaluate_answers(
    answer_path: str,
    eval_chat_model: BaseChatModel,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
    throttled:bool = True
) -> None:
    """
    Evaluates generated answers. Modifies the given answer file in place for better checkpointing.

    Each record gets `eval_score_<evaluator_name>` and `eval_feedback_<evaluator_name>`
    fields. Records that already have a non-empty score are skipped, and the file is
    rewritten after every evaluation. Nothing happens if the file does not exist.

    Args:
        answer_path: path of the JSON file holding the list of result records
        eval_chat_model: chat model used as evaluator
        evaluator_name: name used to build the score and feedback field names
        evaluation_prompt_template: prompt template forwarded to `evaluate_single_answer`
        throttled: whether evaluator calls go through the rate limiter
    """
    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        # Context manager so the file handle is closed right after reading.
        with open(answer_path, "r") as f:
            answers = json.load(f)

    for experiment in tqdm(answers):
        if experiment.get(score_key):
            continue

        # evaluate_single_answer either returns exactly [feedback, score] or raises,
        # so no further length check is needed here.
        feedback, score = evaluate_single_answer(evaluation_prompt_template, experiment, throttled, eval_chat_model)
        experiment[score_key] = score
        experiment[feedback_key] = feedback

        # Checkpoint after every evaluation.
        with open(answer_path, "w") as f:
            json.dump(answers, f)

In [None]:
# Grading prompt for the evaluator model.
# Placeholders: {instruction}, {response}, {reference_answer}. The doubled braces
# ({{ ... }}) are escapes that render as literal braces once the template is formatted.
# The reply is expected to contain "[RESULT]" followed by an integer between 1 and 5.
EVALUATION_PROMPT = """ You are a fair evaluator language model.
###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


# Chat prompt that wraps EVALUATION_PROMPT in a single human message.
EVALUATION_PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        # The separate system message is disabled; the same sentence opens EVALUATION_PROMPT instead.
        # SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

## Run evaluations

In [None]:
def generate_eval_results():
    """Grade every JSON answer file found directly under ./output with the evaluator model."""
    import glob

    answer_files = glob.glob("./output/*.json")
    for answer_file in answer_files:
        print(f"Evaluating {answer_file}")
        evaluate_answers(
            answer_path=answer_file,
            eval_chat_model=EVAL_MODEL,
            evaluator_name=EVALUATOR_NAME,
            evaluation_prompt_template=EVALUATION_PROMPT_TEMPLATE,
            throttled=False,  # throttling is not needed for local model
        )

# Grade all answer files; scores and feedback are written back into those files.
generate_eval_results()

In [None]:
import pandas as pd
import json

def load_eval_results():
    """
    Load every JSON result file under ./output into a single DataFrame.

    Files are read in sorted order so the row order is deterministic. A `settings`
    column holding the path of the originating file is added to every row.

    Returns:
        The concatenation of all result files; the index of each file's frame is kept.

    Raises:
        ValueError: if no file matches ./output/*.json.
    """
    import glob
    outputs = []
    for file in sorted(glob.glob("./output/*.json")):
        # Context manager so each file handle is closed right after reading.
        with open(file, "r") as f:
            output = pd.DataFrame(json.load(f))
        output["settings"] = file
        outputs.append(output)
    if not outputs:
        # pd.concat([]) would also raise ValueError, with a less helpful message.
        raise ValueError("No result files found matching ./output/*.json")
    return pd.concat(outputs)

# All graded results from ./output in one DataFrame, with a `settings` column naming the source file.
EVAL_RESULTS = load_eval_results()
display(EVAL_RESULTS)

In [None]:
# Get diffs: questions for which the langchain answer scored higher than the doc-agent answer.
import duckdb
# The query refers to the `EVAL_RESULTS` DataFrame by its Python variable name.
# Rows are assigned to a framework by the prefix of their `test_settings` label.
# NOTE(review): scores are stored as the text returned by the evaluator, so `>` looks like
# a string comparison here; that orders single digits 1-5 correctly — confirm, or cast to INTEGER.
DIFF_SQL = "SELECT tbl1.question, tbl1.true_answer, tbl1.generated_answer as langchain_answer, tbl1.score as langchain_score, tbl2.generated_answer as doc_agent_answer, tbl2.score as doc_agent_score "\
           "FROM "\
           f"(SELECT *, \"eval_score_{EVALUATOR_NAME}\" as score FROM EVAL_RESULTS where test_settings like 'langchain%') AS tbl1 "\
           "JOIN "\
           f"(SELECT *, \"eval_score_{EVALUATOR_NAME}\" as score FROM EVAL_RESULTS where test_settings like 'doc_agent%') AS tbl2 "\
           "ON tbl1.question = tbl2.question " \
           f"WHERE tbl1.score > tbl2.score"

DIFFS = duckdb.query(DIFF_SQL).to_df()

# Lift all display limits so every row, column and full cell text of the diff is shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
display(DIFFS)

# Also export the diff as a spreadsheet.
DIFFS.to_excel("./output/diffs.xlsx")

## Scoring evaluation results

In [None]:
import pandas as pd

def scoring_output(eval_result: pd.DataFrame, evaluator_name: str) -> pd.Series:
    """
    Compute the average normalized score of each test setting.

    Args:
        eval_result: results with an `eval_score_<evaluator_name>` column and a `settings` column
        evaluator_name: evaluator whose score column is used

    Returns:
        A Series indexed by `settings` holding the mean score scaled to [0, 1],
        sorted in ascending order.
    """
    score_field = f"eval_score_{evaluator_name}"
    result = eval_result.loc[:, [score_field, "settings"]].copy()

    # Scores are stored as strings; anything else (e.g. a missing score) counts as the lowest score, 1.
    result[score_field] = result[score_field].apply(lambda x: int(x) if isinstance(x, str) else 1)

    # Map the 1-5 scale onto 0-1.
    result[score_field] = (result[score_field] - 1) / 4
    average_scores = result.groupby("settings")[score_field].mean()

    # sort_values() returns a new Series; the original discarded it, so nothing was sorted.
    return average_scores.sort_values()

# Average normalized score per result file (the `settings` column holds the file path).
scores = scoring_output(EVAL_RESULTS, EVALUATOR_NAME)
display(scores)