In [None]:
import datasets
import matplotlib.pyplot as plt
import os
import pandas as pd
import torch
from IPython.display import display
from pathlib import Path
from typing import Optional, List, Tuple

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Pipeline, pipeline

from ragatouille import RAGPretrainedModel

from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Do not truncate long cell values (prompts, responses, source paths) when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# Settings

In [None]:
# Root directory that is walked recursively for the markdown documents to ingest.
DATASET = "data/path/to/document/directory"
# Prompts that are run through every model variant in the "Running the Model" section
# and plotted alongside the document chunks in the vector clustering analysis.
QUESTIONS = [
    "insert multiple questions here",
    "some should be relevant to the document use case",
    "some should be irrelevant to the use case",
    ]
EMBEDDING_MODEL_NAME = "model/path/to/document/parsing/model" # embeds the document chunks (and queries) for the knowledge base
READER_MODEL_NAME = "model/path/to/user/interface/model" # handles receiving the user prompt and generating the response
RERANKER_MODEL = "model/path/to/raranking/model" # reranks the retrieved chunks so the most relevant ones are used in the response
CHUNK_SIZE = 512 # Maximum chunk length in tokens; choose a size that best fits the embedding model

# Document Ingest

## Initial Data Preparation

Ingest a directory of documents to be turned into a knowledge base. This results in a list of dictionaries, one per document, each holding the document's `text` and its file path under `source` for reference later in the response generation.

In [None]:
def ingest_markdown_directory(rootdir: str) -> List[dict]:
    """
    Recursively collect every Markdown file found under `rootdir`.

    Args:
        rootdir: Directory to walk; all nested sub-directories are visited.

    Returns:
        A list with one dict per `.md` / `.mdx` file, holding the file's
        contents under `"text"` and the file's path under `"source"`.
    """
    # `str.endswith` needs a tuple to test several suffixes; an expression such as
    # `".md" or "mdx"` collapses to just ".md" and would silently skip .mdx files.
    markdown_suffixes = (".md", ".mdx")

    files_dict = []
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            if not file.endswith(markdown_suffixes):
                continue
            filepath = os.path.join(subdir, file)
            files_dict.append({
                # Read as UTF-8 explicitly so the result does not depend on the machine's locale.
                "text": Path(filepath).read_text(encoding="utf-8"),
                "source": filepath,
            })
    return files_dict

In [None]:
# Load every markdown file under DATASET as {"text", "source"} records.
ds = ingest_markdown_directory(DATASET)

# Wrap each record in a LangChain document, carrying the file path along as metadata.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=record["text"],
        metadata={"source": record["source"]},
    )
    for record in ds
]

# Split points handed to the text splitter, listed from markdown structure
# down to single characters.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",        # heading markers
    "```\n",            # code fences
    "\n\\*\\*\\*+\n",   # rules written with asterisks
    "\n---+\n",         # rules written with dashes
    "\n___+\n",         # rules written with underscores
    "\n\n",             # blank lines
    "\n",               # line breaks
    " ",                # spaces
    "",                 # last resort: between any two characters
]

In [None]:
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer named by `tokenizer_name`.
        knowledge_base: Documents to split.
        tokenizer_name: Hugging Face tokenizer used to count tokens.

    Returns:
        The chunks in splitting order, keeping only the first chunk for any
        given `page_content` (exact duplicates are dropped).
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # MARKDOWN_SEPARATORS contains regex patterns (e.g. "\n#{1,6} "); without this
        # flag the splitter escapes them and only matches the literal pattern text.
        is_separator_regex=True,
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove duplicates, keeping the first occurrence of each chunk text
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content in seen_texts:
            continue
        seen_texts.add(doc.page_content)
        docs_processed_unique.append(doc)

    return docs_processed_unique

## Process the Documents

Take the documents and split them into chunks based on the `CHUNK_SIZE` variable. Different chunk sizes can produce different results, so the resulting chunks are analyzed in the next section. The chunks are then run through the embedding model to be converted into document vectors before being loaded into a FAISS vector database, which will serve as the knowledge base for the LLM.

In [None]:
# Chunk the raw documents, counting tokens with the embedding model's tokenizer.
docs_processed = split_documents(
    chunk_size=CHUNK_SIZE,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

# Embedding model shared by the vector store and the query embeddings further down.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
    multi_process=True,
)

# Embed every chunk and store the vectors in a FAISS index.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Document Processing Analysis


## Document Splitting Analysis

Analysis of the document splitting process. Documents are split into chunks: if the splitting is too aggressive, chunks won't be long enough to carry the necessary context, but if they are too long, information can get lost within the chunk. Also, most documents should be similar in length, so the distribution of chunk sizes should appear as a left-skewed bell curve.

In [None]:
from sentence_transformers import SentenceTransformer

# Report the maximum sequence length of the configured embedding model, so it can be
# compared with CHUNK_SIZE and with the chunk-length distribution plotted below.
print(f"Model's maximum sequence length: {SentenceTransformer(EMBEDDING_MODEL_NAME).max_seq_length}")

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(tokenizer.encode(doc.page_content)) for doc in docs_processed]

# Plot the distribution of chunk lengths, counted as the number of tokens
fig = pd.Series(lengths).hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.xlabel("Tokens per chunk")
plt.ylabel("Number of chunks")
plt.show()

## Vector Clustering Analysis

This looks at how the individual document chunks have been turned into vectors that will be used in responding to a prompt. While not the be-all and end-all, this analysis can highlight any clusters of documents that may be out of alignment with the rest of the corpus. If that is the case, consider whether that set of documents is necessary for the intended use case or whether another embedding model should be used.

In [None]:
import pacmap
import numpy as np
import plotly.express as px


def _source_group(source_path: str) -> str:
    """
    Label a chunk by the first path component below DATASET.

    That is the top-level sub-directory for nested files, or the file name for
    files stored directly in DATASET. Paths outside DATASET fall back to the
    name of their parent directory.
    """
    try:
        relative_parts = Path(source_path).relative_to(DATASET).parts
    except ValueError:
        return Path(source_path).parent.name or source_path
    return relative_parts[0] if relative_parts else source_path


# embed the user queries in the same space as the document chunks
query_vectors = [embedding_model.embed_query(question) for question in QUESTIONS]

embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

# Read all chunk vectors back from the FAISS index in a single call instead of one
# call per chunk. Despite its name, `embeddings_2d` holds the full-dimensional
# vectors (chunks first, then queries); the 2D projection is `documents_projected`.
n_chunks = len(docs_processed)
embeddings_2d = list(KNOWLEDGE_VECTOR_DATABASE.index.reconstruct_n(0, n_chunks)) + list(query_vectors)

# fit the data (The index of transformed data corresponds to the index of the original data)
documents_projected = embedding_projector.fit_transform(np.array(embeddings_2d), init="pca")

df = pd.DataFrame(
    [
        {
            "x": documents_projected[i, 0],
            "y": documents_projected[i, 1],
            # Group by location relative to DATASET rather than a fixed path depth,
            # which raised IndexError for shallow paths.
            "source": _source_group(docs_processed[i].metadata["source"]),
            "extract": docs_processed[i].page_content[:100] + "...",
            "symbol": "circle",
            "size_col": 4,
        }
        for i in range(n_chunks)
    ]
    + [
        {
            # Query rows follow the chunk rows in the projected array.
            "x": documents_projected[n_chunks + i, 0],
            "y": documents_projected[n_chunks + i, 1],
            "source": "User query",
            "extract": QUESTIONS[i],
            "size_col": 100,
            "symbol": "star",
        }
        for i in range(len(QUESTIONS))
    ]
)

# visualize the embedding
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="source",
    hover_data=["extract"],
    size="size_col",
    symbol="symbol",
    color_discrete_map={"User query": "black"},
    width=1000,
    height=700,
)
fig.update_traces(
    marker=dict(opacity=1, line=dict(width=0, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
)
fig.update_layout(
    legend_title_text="<b>Chunk source</b>",
    title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
)
fig.show()

# Response Generator Setup

## Configure the Reader LLM

Set the Reader LLM that will parse the prompt and generate the response

In [None]:
# Quantization settings for the reader model: 4-bit NF4 weights with double
# quantization and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and quantized weights both come from READER_MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    quantization_config=bnb_config,
)

# Text-generation pipeline used for every answer; it returns only the newly
# generated text (not the prompt), capped at 500 new tokens.
READER_LLM = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    max_new_tokens=500,
    return_full_text=False,
)

Here is the full prompt that will be sent to the LLM for all prompts. The given prompt will be inserted into the `question` field while the relevant document chunks will be given to the LLM in the `context` field. Also included is the prompt used when the LLM does not have an attached knowledge base.

In [None]:
# Chat messages for the RAG runs: `{context}` receives the retrieved chunks and
# `{question}` the user prompt when the template is formatted.
rag_prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
# Render the messages with the reader model's chat template, leaving the
# `{context}` / `{question}` placeholders in place for `str.format`.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    rag_prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)

# Chat messages for the control run without a knowledge base. No context is
# supplied here, so the refusal instruction must not refer to one; otherwise
# the control is told to refuse based on context it never receives.
llm_prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the source when relevant.
If you do not know the answer, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """
Here is the question you need to answer.

Question: {question}""",
    },
]
LLM_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    llm_prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)

In [None]:
# Load the reranking model once; it is handed to `answer_with_rag` through its `reranker` argument.
RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL)

In [None]:
def answer_with_rag(
    question: str,
    llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
) -> Tuple[str, List[str]]:
    """
    Answer `question` with the reader LLM, grounded in chunks retrieved from `knowledge_index`.

    The prompt is built from the module-level `RAG_PROMPT_TEMPLATE`.

    Args:
        question: The user prompt.
        llm: Text-generation pipeline; called with the final prompt string.
        knowledge_index: Vector store queried with `similarity_search`.
        reranker: Optional reranking model; when given, the retrieved chunks are
            reordered with `reranker.rerank` before the top ones are kept.
        num_retrieved_docs: Number of chunks fetched from the vector store.
        num_docs_final: Number of chunks placed in the prompt.

    Returns:
        A tuple `(answer, sources)`: the generated text and the `"source"`
        metadata of each chunk placed in the prompt, in prompt order.
    """
    # Gather documents with retriever
    print("=> Retrieving documents...")
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    doc_sources = [doc.metadata for doc in relevant_docs]
    relevant_docs_content = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results (nothing to rerank when the retriever came back empty)
    if reranker and relevant_docs_content:
        print("=> Reranking documents...")
        ranked_docs = reranker.rerank(
            question,
            relevant_docs_content,
            # Never ask for more results than there are candidates.
            k=min(num_docs_final, len(relevant_docs_content)),
        )
        # The reranker returns only the text, so map each text back to its metadata
        # (first occurrence wins, matching a left-to-right lookup).
        metadata_by_content = {}
        for content, metadata in zip(relevant_docs_content, doc_sources):
            metadata_by_content.setdefault(content, metadata)
        relevant_docs_content = [doc["content"] for doc in ranked_docs]
        doc_sources = [metadata_by_content[content] for content in relevant_docs_content]

    relevant_docs_content = relevant_docs_content[:num_docs_final]
    doc_sources = doc_sources[:num_docs_final]

    # Build the final prompt; a blank line separates documents so that one chunk's
    # text does not run straight into the next "Document N:::" header.
    context = "\nExtracted documents:\n"
    context += "\n\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs_content))

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate an answer
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    sources = [metadata["source"] for metadata in doc_sources]

    return answer, sources

def answer_with_no_rag(
    question: str,
    llm: Pipeline,
) -> str:
    """
    Answer `question` with the reader LLM alone, without any retrieved context.

    The prompt is built from the module-level `LLM_PROMPT_TEMPLATE`.

    Args:
        question: The user prompt.
        llm: Text-generation pipeline; called with the final prompt string.

    Returns:
        The generated text of the first result returned by `llm`.
    """
    # Fill the context-free template with the question.
    filled_prompt = LLM_PROMPT_TEMPLATE.format(question=question)

    print("=> Generating answer...")
    generations = llm(filled_prompt)
    return generations[0]["generated_text"]

# Running the Model

## Control Model

This is the control version of the model, producing a response only based on the base model. There is no additional context provided by the knowledge base

In [None]:
# Control run: every question is answered by the reader LLM alone.
plain_dict = []
for question in QUESTIONS:
    answer = answer_with_no_rag(question, llm=READER_LLM)
    plain_dict.append(dict(prompt=question, response=answer))

# One row per question.
plain_df = pd.DataFrame(plain_dict)
display(plain_df)

## Basic RAG Model

This model uses the provided documents with the provided model in a RAG system. The response includes both the answer to the given question and the documents used to source the answer.

In [None]:
# Basic RAG run: retrieval from the vector database, no reranking.
rag_dict = []
for question in QUESTIONS:
    answer, relevant_docs = answer_with_rag(
        question,
        llm=READER_LLM,
        knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
    )
    rag_dict.append(dict(prompt=question, response=answer, sources=relevant_docs))

# One row per question, including the sources that were placed in the prompt.
rag_df = pd.DataFrame(rag_dict)
display(rag_df)

## RAG Model with Reranking

This model uses the provided documents with the provided model in a RAG system. Documents are assessed for vector similarity to the given prompt, then reranked by a second reranking model to provide more relevant source documents for the response. The response includes both the answer to the given question and the documents used to source the answer.

In [None]:
# RAG run with reranking: retrieved chunks are reordered by RERANKER before prompting.
rank_dict = []
for question in QUESTIONS:
    answer, relevant_docs = answer_with_rag(
        question,
        llm=READER_LLM,
        knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
        reranker=RERANKER,
    )
    rank_dict.append(dict(prompt=question, response=answer, sources=relevant_docs))

# One row per question, including the sources that were placed in the prompt.
rank_df = pd.DataFrame(rank_dict)
display(rank_df)