In [None]:
# %conda activate my_pyenv

%pip install --upgrade langchain langchain-community langchain-ollama langchain-huggingface
%pip install --upgrade ragas
%pip install --upgrade datasetscc

In [None]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
from huggingface_hub import notebook_login


import os

# Show full cell contents when displaying DataFrames (contexts and answers are long strings).
pd.set_option("display.max_colwidth", None)

# Secrets must never be committed with the notebook: read the token from the environment.
# The value is None when the variable is unset; notebook_login() below still authenticates interactively.
# Any token that was previously hardcoded here should be treated as leaked and revoked.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
notebook_login()

### Load chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document
import os


files = []
for fname in os.listdir("../sources"):
    complete_path = os.path.join("../sources", fname)
    if os.path.isfile(complete_path):
        files.append(complete_path)

chunk_size = 10_000
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_size/10,
    add_start_index = True,
)

docs = []
page_contents = []
giant_docs = []

for file in files:
    loader = PyMuPDFLoader(file, extract_images = False)
    giant_doc = {"page_content": "", "metadata": ""}
    first = True
    async for doc in loader.alazy_load():
        if first:
            metadata = {k: v for k, v in doc.metadata.items() if k != "page"}
            giant_doc["metadata"] = metadata
        giant_doc["page_content"] += doc.page_content
        first = False
    giant_docs.append(giant_doc)

for gdoc in giant_docs:
    page_contents = text_splitter.split_text(gdoc["page_content"])
    docs += [{"metadata": gdoc["metadata"], "page_content": pc} for pc in page_contents]

docs = [Document(metadata = doc["metadata"], page_content = doc["page_content"]) for doc in docs]

In [None]:
# Sanity check: number of chunks produced by the splitting step above.
print(len(docs))

### Context cleaning for QA generation

In [20]:
import re


def decapitalize_content(pages: "list[Document]") -> None:
    """Lower-case the text of every document, in place.

    Args:
        pages: objects exposing a mutable ``page_content`` string attribute
            (not plain strings, despite the previous annotation).

    Returns:
        None. Each element's ``page_content`` is overwritten.
    """
    for page in pages:
        page.page_content = page.page_content.lower()


def remove_non_ASCII(pages: "list[Document]") -> None:
    """Strip non-ASCII characters from every document, in place.

    Documents whose ``keywords`` metadata contains ``"non-en"`` are left untouched, because many
    non-English languages rely on non-ASCII characters.

    Args:
        pages: objects exposing ``page_content`` (str) and ``metadata`` (dict).

    Returns:
        None. Each processed element's ``page_content`` is overwritten.
    """
    for page in pages:
        # A missing or empty `keywords` entry means the document is not flagged as non-English.
        keywords = page.metadata.get("keywords") or ""
        if "non-en" not in keywords:
            page.page_content = re.sub(r"[^\x00-\x7F]+", "", page.page_content)


def remove_bullets(pages: "list[Document]") -> None:
    """Remove list markers from every document, in place.

    Two kinds of markers are stripped:
      * symbol bullets (→ • ▪ - * ✔ ➢ ● ✗) at the start of a line, with the whitespace after them;
      * numeric markers such as ``1.`` at the start of a line when a letter follows.

    Args:
        pages: objects exposing a mutable ``page_content`` string attribute.

    Returns:
        None. Each element's ``page_content`` is overwritten.
    """
    for page in pages:
        text = re.sub(r"^[→•▪\-*✔➢●✗]\s*", "", page.page_content, flags = re.MULTILINE)
        # Anchored to the start of a line so that a number ending a sentence ("... in 2020. The")
        # is not mistaken for a list marker and deleted.
        text = re.sub(r"^[ \t]*\d+\.(?=\s*[a-zA-Z])", "", text, flags = re.MULTILINE)
        page.page_content = text


def remove_escape(pages: "list[Document]") -> None:
    """Collapse every run of whitespace (spaces, tabs, newlines, ...) into one space, in place.

    Leading and trailing whitespace is removed as well.

    Args:
        pages: objects exposing a mutable ``page_content`` string attribute.

    Returns:
        None. Each element's ``page_content`` is overwritten.
    """
    for page in pages:
        # str.split() with no argument splits on any whitespace run and drops empty pieces.
        page.page_content = " ".join(page.page_content.split())


# Apply the cleaning steps to `docs` in place, in this exact order.
for cleaning_step in (
    remove_non_ASCII,
    decapitalize_content,
    remove_bullets,
    remove_escape,
):
    cleaning_step(docs)

### Set up the agent for question generation

In [None]:
from huggingface_hub import InferenceClient


# Hub repository of the model used to generate the synthetic question/answer pairs.
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Inference client bound to that model.
# NOTE(review): timeout = None presumably disables the request timeout - confirm against the
# huggingface_hub documentation.
llm_client = InferenceClient(
    model=repo_id,
    timeout = None,
)


def call_llm(inference_client: InferenceClient, prompt: str):
    """Send ``prompt`` to the text-generation endpoint and return the generated text.

    Args:
        inference_client: client used to post the raw request.
        prompt: text forwarded as the ``inputs`` field of the payload.

    Returns:
        The ``generated_text`` field of the first element of the decoded JSON response.
    """
    # NOTE(review): `InferenceClient.post` appears to be deprecated in recent huggingface_hub
    # releases - confirm before upgrading the package.
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 1000},
        "task": "text-generation",
    }
    raw_response = inference_client.post(json=payload)
    generations = json.loads(raw_response.decode())
    return generations[0]["generated_text"]


# Smoke test: one real request to check that the client and the endpoint respond.
call_llm(llm_client, "This is a test context")

In [22]:
# Prompt template for synthetic QA generation. `{context}` is filled with one chunk of text.
# The "Factoid question: " and "Answer: " markers are the ones the generation cell splits on.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

### Generate questions

In [None]:
import random


import os

N_GENERATIONS = 10
MAX_ANSWER_LENGTH = 300  # answers of this many characters or more are discarded


def _parse_qa_couple(generated_text):
    """Extract (question, answer) from the model output, or return None when it is unusable.

    The output is unusable when one of the "Factoid question: " / "Answer: " markers is missing
    or when the answer is MAX_ANSWER_LENGTH characters or longer.
    """
    if "Factoid question: " not in generated_text or "Answer: " not in generated_text:
        return None
    question = generated_text.split("Factoid question: ")[-1].split("Answer: ")[0]
    answer = generated_text.split("Answer: ")[-1]
    if len(answer) >= MAX_ANSWER_LENGTH:
        return None
    return question, answer


# random.sample raises ValueError when asked for more items than available.
n_generations = min(N_GENERATIONS, len(docs))
print(f"Generating {n_generations} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs, n_generations)):
    # Generate QA couple
    output_QA_couple = call_llm(llm_client, QA_generation_prompt.format(context=sampled_context.page_content))
    parsed = _parse_qa_couple(output_QA_couple)
    # Skip malformed generations and chunks without a source, instead of hiding every possible
    # error behind a bare `except`.
    if parsed is None or "source" not in sampled_context.metadata:
        continue
    question, answer = parsed
    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source_doc": sampled_context.metadata["source"],
        }
    )

# NOTE(review): this writes all_QA_1.json while later cells read all_QA.json - presumably the
# batches are merged by hand; confirm.
os.makedirs("dataset", exist_ok=True)
with open("dataset/all_QA_1.json", "w") as f:
    json.dump(outputs, f)

In [None]:
# Load the QA couples stored on disk into `outputs`.
with open("dataset/all_QA.json", "r") as qa_file:
    outputs = json.load(qa_file)

In [None]:
# Preview at most the first 100 QA couples as a table.
display(pd.DataFrame(outputs).head(100))

### Set up the RAG generator (reader LLM)

In [None]:
from langchain import hub


# Prompt sent to the reader LLM. `{question}` and `{context}` are filled in by answer_with_rag.
RAG_PROMPT_TEMPLATE = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum, keep the answer concise and DO NOT mention from which documents you take information.
    Question: {question}\n 
    Context: {context}\n 
    Answer: 
"""

In [None]:
from langchain_huggingface import HuggingFaceEndpoint


import os
from getpass import getpass

repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "Zephyr_7B"  # short label used in the names of the output files

# Secrets must never be committed with the notebook: take the token from the environment and
# fall back to an interactive prompt. Any token previously hardcoded here should be revoked.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN") or getpass("Hugging Face API token: ")

# Reader model: generates the final answer from the question and the retrieved context.
READER_LLM = HuggingFaceEndpoint(
    repo_id=repo_id,
    task="text-generation",
    huggingfacehub_api_token=HF_API_TOKEN,
    max_new_tokens = 512,
    top_k = 30,
    temperature = 0.1,
    repetition_penalty = 1.03
)

In [9]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.language_models.llms import LLM
from langchain.docstore.document import Document


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: InMemoryVectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """Answer a question using RAG with the given knowledge index.

    Args:
        question: the user question.
        llm: reader model that receives the final prompt.
        knowledge_index: vector store queried with ``similarity_search``.
        reranker: optional reranker; when given, its ``rerank`` method reorders the retrieved texts.
        num_retrieved_docs: number of documents requested from the vector store.
        num_docs_final: maximum number of documents placed in the prompt.

    Returns:
        A tuple ``(answer, relevant_docs)`` where ``relevant_docs`` is the list of document texts
        (strings, not Document objects) that were put in the prompt.
    """
    # Gather documents with the retriever and keep only their text
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]

    # Optionally rerank results
    if reranker:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join(f"Document {i}:::\n" + doc for i, doc in enumerate(relevant_docs))

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer. `invoke` is the supported entry point; calling the model object
    # directly (`llm(prompt)`) is deprecated in LangChain.
    answer = llm.invoke(final_prompt)

    return answer, relevant_docs

### Benchmark RAG pipeline

In [10]:
from langchain_core.language_models import BaseChatModel
from langchain_core.vectorstores import VectorStore


def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
    k = 4, 
    k_final = 4
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in ``output_file`` are reused: questions found there are skipped,
    so an interrupted run can be resumed. The file is rewritten after every answered question.

    Args:
        eval_dataset: iterable of records with "question", "answer" and "source_doc" keys.
        llm: reader model forwarded to ``answer_with_rag``.
        knowledge_index: vector store forwarded to ``answer_with_rag``.
        output_file: path of the JSON file holding the results.
        reranker: optional reranker forwarded to ``answer_with_rag``.
        verbose: when truthy, print each question with the generated and the reference answer.
        test_settings: optional label stored in every result under "test_settings".
        k: number of documents to retrieve.
        k_final: number of documents kept in the prompt.
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except FileNotFoundError:
        outputs = []
    except json.JSONDecodeError:
        print(f"Could not parse {output_file}; starting from an empty result list.")
        outputs = []

    # Set of answered questions, kept up to date in the loop, instead of rebuilding a list of
    # all previous questions for every example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, 
                                                knowledge_index, 
                                                reranker = reranker,
                                                num_retrieved_docs = k,
                                                num_docs_final = k_final)

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example so that a crash loses at most one answer.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

### Define embeddings

In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings


# Embedding models available to the benchmark, keyed by the short name used in file paths.
all_embeddings = {
    "llama3.2:1b": OllamaEmbeddings(model = "llama3.2:1b"),

    "llama3.2:3b": OllamaEmbeddings(model = "llama3.2:3b"),

    # This entry used to point at "llama3.2:1b" (copy-paste of the first entry).
    # NOTE(review): "gemma:2b" is assumed to be the intended Ollama tag - confirm.
    "gemma2b": OllamaEmbeddings(model = "gemma:2b"),

    "mpnet_base_v2": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"),

    "minilm_l6": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2"),

    "minilm_l12": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2")
}

### Generate answers with RAG

In [None]:
import os


def _rag_run_is_complete(output_file, eval_dataset):
    """Return True when output_file already holds a result for every question of eval_dataset."""
    try:
        with open(output_file, "r") as f:
            answered = {entry["question"] for entry in json.load(f)}
    except (OSError, ValueError, KeyError, TypeError):
        # Missing, unreadable or malformed file: the run has to be (re)started.
        return False
    return all(example["question"] in answered for example in eval_dataset)


def run_tests(chunking_type, semantic_chunking_type, model_name, chunk_size, chunk_overlap, eval_dataset):
    """Run the RAG benchmark for one (chunking, embedding model) configuration.

    Results are written to ``./output/rag_<settings>/dataset.json``. Nothing is done when that file
    already covers every question of ``eval_dataset``; a partial file is completed by
    ``run_rag_tests``, which skips the questions it already contains.

    Args:
        chunking_type: "page_chunking", "semantic", or anything else for fixed-size chunking.
        semantic_chunking_type: name of the semantic variant; only used when chunking_type is "semantic".
        model_name: key of ``all_embeddings`` and component of the vector store path.
        chunk_size: chunk size; only used for fixed-size chunking.
        chunk_overlap: chunk overlap; only used for fixed-size chunking.
        eval_dataset: iterable of QA records forwarded to ``run_rag_tests``.
    """
    if chunking_type == "page_chunking":
        chunking = chunking_type
    elif chunking_type == "semantic":
        chunking = f"{chunking_type}_{semantic_chunking_type}"
    else:
        chunking = f"{chunk_size}_{chunk_overlap}"

    settings_name = f"chunk:{chunking}_embeddings:{model_name}_reader-model:{READER_MODEL_NAME}"
    output_dir = f"./output/rag_{settings_name}"
    output_file_name = f"{output_dir}/dataset.json"

    if _rag_run_is_complete(output_file_name, eval_dataset):
        return

    # makedirs(exist_ok=True) also copes with a directory left behind by an interrupted run.
    os.makedirs(output_dir, exist_ok=True)

    print("Running RAG...")
    print(f"Configuration: model: {model_name}, chunking: {chunking}")
    reranker = None
    vector_store_path = f"../indexing/models/No OCR/{model_name}/{chunking}/{model_name}"
    embeddings = all_embeddings[model_name]
    vector_store = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)
    run_rag_tests(
        eval_dataset = eval_dataset,
        llm = READER_LLM,
        knowledge_index = vector_store,
        output_file = output_file_name,
        reranker = reranker,
        verbose = False,
        test_settings = settings_name,
        k = 4,
        k_final = 4
    )


os.makedirs("./output", exist_ok = True)

# Grid of configurations to benchmark.
chunking_types = ["page_chunking", "fixed_number", "semantic"]
chunk_sizes = [384, 256]
chunk_overlaps = [0, 20, 50, 100]
semantic_chunking_types = ["percentile", "standard_deviation", "interquartile", "gradient"]
model_names = ["mpnet_base_v2", "minilm_l6", "minilm_l12"]
READER_MODEL_NAME = "Zephyr_7B"

with open("dataset/all_QA.json", "r") as f:
    eval_dataset = json.load(f)


def _chunking_variants(chunking_type):
    """Yield (semantic_chunking_type, chunk_size, chunk_overlap) for every variant of a chunking type.

    Parameters that do not apply to the chunking type are yielded as None.
    """
    if chunking_type == "fixed_number":
        for size in chunk_sizes:
            for overlap in chunk_overlaps:
                yield None, size, overlap
    elif chunking_type == "semantic":
        for semantic_type in semantic_chunking_types:
            yield semantic_type, None, None
    else:
        yield None, None, None


# Single call site for the three chunking families; the iteration order is unchanged
# (model, then chunking type, then the variants of that type).
for model_name in model_names:
    for chunking_type in chunking_types:
        for semantic_chunking_type, chunk_size, chunk_overlap in _chunking_variants(chunking_type):
            run_tests(chunking_type = chunking_type,
                      semantic_chunking_type = semantic_chunking_type,
                      model_name = model_name,
                      chunk_size = chunk_size,
                      chunk_overlap = chunk_overlap,
                      eval_dataset = eval_dataset)

### Evaluate RAG with RAGAS

In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.run_config import RunConfig
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from ragas.metrics import context_precision, answer_relevancy, faithfulness, context_recall
from ragas import EvaluationDataset
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
import os
import json


# Only needed by the commented-out HuggingFaceEndpoint judge below. Read from the environment:
# secrets must not be committed with the notebook (a previously hardcoded token should be revoked).
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
# NOTE(review): presumably set so that ragas does not complain about a missing OpenAI key even
# though a local judge is used - confirm.
os.environ["OPENAI_API_KEY"] = ""
run_config = RunConfig(timeout = 600, max_wait = 600)

# Configuration whose RAG outputs are evaluated.
chunking_type = "page_chunking"
semantic_chunking_type = "percentile"  # only used when chunking_type == "semantic"
chunk_size = 256
chunk_overlap = 100
model_name = "mpnet_base_v2"
READER_MODEL_NAME = "Zephyr_7B"

# Same naming scheme as the cell that generates the answers.
if chunking_type == "fixed_number":
    chunking = f"{chunk_size}_{chunk_overlap}"
elif chunking_type == "semantic":
    chunking = f"{chunking_type}_{semantic_chunking_type}"
else:
    chunking = chunking_type

settings_name = f"chunk:{chunking}_embeddings:{model_name}_reader-model:{READER_MODEL_NAME}"

# The generation step writes its results to output/rag_<settings>/dataset.json.
with open(f"output/rag_{settings_name}/dataset.json", "r") as f:
    dataset = json.load(f)

# NOTE(review): only the first record is evaluated - looks like a smoke-test setting; confirm
# before reporting metrics.
dataset = dataset[:1]
d = {
    "user_input": [entry["question"] for entry in dataset],
    "retrieved_contexts": [entry["retrieved_docs"] for entry in dataset],
    "response": [entry["generated_answer"] for entry in dataset],
    "reference": [entry["true_answer"] for entry in dataset],
}

eval_dataset = EvaluationDataset.from_hf_dataset(Dataset.from_dict(d))

# Judge model and embeddings used by ragas (alternatives kept commented out).
# llm = HuggingFaceEndpoint(
#     repo_id = 'meta-llama/Llama-3.2-3B',
#     huggingfacehub_api_token = HF_API_TOKEN,
# )
llm = ChatOllama(model = "llama3.2:1b")
embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2")
# embeddings = OllamaEmbeddings(model = "llama3.2:1b")

results = evaluate(dataset = eval_dataset, 
                   metrics = [context_precision], 
                   llm = llm, 
                   embeddings = embeddings,
                   run_config = run_config)

results.to_pandas()