# Visualize your RAG Data - EDA for Retrieval-Augmented Generation
## How to use UMAP dimensionality reduction for embeddings to show questions, answers and their relationships to source documents with OpenAI, LangChain and ChromaDB
- This notebook is part of an article to-be-published. 
- You can skip all preparation steps and **go directly to the visualizations section if you only want to check out the visualization** of prepared data.

### Get ready

In [None]:
!pip install langchain langchain-openai chromadb renumics-spotlight
# %env OPENAI_API_KEY=<your-api-key> # uncomment and set your OpenAI API key

In [None]:
# helper functions for caching expensive LLM calls
from pathlib import Path
import pandas as pd
import json
import hashlib


def write_dict_to_file(data_dict, filename):
    """Append ``data_dict`` to ``filename`` as a single JSON line.

    The file is opened in append mode, so repeated calls build up a
    JSON-lines file with one serialized dictionary per line.
    """
    with open(filename, "a") as handle:
        serialized = json.dumps(data_dict)
        handle.writelines([serialized, "\n"])


def read_dicts_from_file(filename):
    """Yield one dictionary per JSON line of ``filename``.

    The file is read lazily; wrap the call in ``list(...)`` to load all
    entries at once. Blank or whitespace-only lines (e.g. a trailing empty
    line left by an editor) are skipped instead of raising a
    ``json.JSONDecodeError``.
    """
    # json.dumps escapes non-ASCII characters by default, so files written as
    # JSON lines decode cleanly as UTF-8 regardless of the platform default.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)


def add_cached_column_from_file(df, file_name, merge_on, column):
    """Return a copy of ``df`` extended by ``column`` taken from a JSON-lines cache.

    Args:
        df: Dataframe to extend; it is not modified.
        file_name: Path of the JSON-lines cache file (one dict per line).
        merge_on: Name of the key column present in both ``df`` and the cache.
        column: Name of the cached column to add.

    Returns:
        A new dataframe with a fresh 0..n-1 index. Rows without a cached
        value get a missing value in ``column``. If the cache file does not
        exist, or does not contain both ``merge_on`` and ``column``, the
        column is added with ``None`` for every row.
    """
    if Path(file_name).exists():
        cached_rows = pd.DataFrame(list(read_dicts_from_file(file_name)))
    else:
        cached_rows = pd.DataFrame()

    if merge_on not in cached_rows.columns or column not in cached_rows.columns:
        # DataFrame.insert works in place and returns None, so build the
        # result explicitly instead of returning the call's value.
        result = df.copy()
        result.insert(len(result.columns), column, None)
        return result.reset_index(drop=True)

    cached_column = (
        cached_rows[[column, merge_on]]
        # drop empty entries first so that a later valid entry for the same
        # key is not discarded by the de-duplication below
        .dropna()
        .drop_duplicates(subset=[merge_on])
        .reset_index(drop=True)
    )
    return df.merge(
        cached_column,
        on=merge_on,
        how="left",
    ).reset_index(drop=True)


def stable_hash_meta(metadata) -> str:
    """Return a SHA-1 hex digest that identifies a document by its metadata.

    The metadata is serialized to JSON with sorted keys, so two mappings with
    the same content produce the same hash regardless of key order.
    """
    canonical_json = json.dumps(metadata, sort_keys=True)
    digest = hashlib.sha1()
    digest.update(canonical_json.encode())
    return digest.hexdigest()

### Prepare documents
In this section we will create:
- `embeddings_model`: An OpenAI-based model to create embeddings for documents
- `docs`: A list of documents collected from the ./docs/ folder
- `docs_vectorstore`: A vectorstore with the embeddings of the documents

In [None]:
# create embeddings model and vector store
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

# Embedding model shared by the vector store below and by the question embedding step later on.
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
# Chroma collection "docs_store", stored in the local "docs-db" directory and
# embedding its documents with `embeddings_model`.
docs_vectorstore = Chroma(
    collection_name="docs_store",
    embedding_function=embeddings_model,
    persist_directory="docs-db",
)

In [None]:
# load documents with the LangChain document loader
from langchain_community.document_loaders import BSHTMLLoader, DirectoryLoader

# Recursively collect every *.html file below ./docs and parse it with BSHTMLLoader,
# reading the files as UTF-8.
loader = DirectoryLoader(
    "docs",
    glob="*.html",
    loader_cls=BSHTMLLoader,
    loader_kwargs={"open_encoding": "utf-8"},
    recursive=True,
    show_progress=True,
)
docs = loader.load()

In [None]:
# divide documents into splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the documents into chunks of up to 1000 characters with 200 characters overlap.
# `add_start_index=True` is requested so each split's metadata can tell splits of the
# same source apart - presumably via a start offset; verify against the splitter docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
splits = text_splitter.split_documents(docs)
# Pair every split with a stable id derived from its metadata; the id is later used
# as the vector store id of the split.
splits_ids = [
    {"doc": split, "id": stable_hash_meta(split.metadata)} for split in splits
]

In [None]:
# only keep splits that are not already in the vector store
# (a set makes each membership test O(1) instead of scanning the whole id list per split)
existing_ids = set(docs_vectorstore.get()["ids"])
new_splits_ids = [split for split in splits_ids if split["id"] not in existing_ids]

In [None]:
from langchain_core.documents import Document

# add new splits to the vector store, using the stable metadata hash as document id
# so that re-running the notebook does not add the same split twice
if new_splits_ids:
    docs_vectorstore.add_documents(
        documents=[split["doc"] for split in new_splits_ids],
        ids=[split["id"] for split in new_splits_ids],
    )
docs_vectorstore.persist()

### Build the LangChain
In this section, we will use OpenAI and LangChain to prepare a
- `rag_chain`: A LangChain chain that uses gpt-3.5 and the `docs_vectorstore` as retriever

In [None]:
# create language model and retriever
from langchain_openai import ChatOpenAI

# Chat model used to generate the answers; temperature 0.0 is set explicitly.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.0)
# Retriever on top of the document vector store, configured with k=20.
retriever = docs_vectorstore.as_retriever(search_kwargs={"k": 20})

In [None]:
# create a RAG prompt that includes the question and the source documents
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders: {question} receives the user question and
# {source_documents} receives the formatted retrieved documents.
template = """
You are an assistant for question-answering tasks.
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
{source_documents}
=========
FINAL ANSWER: """
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# create a RAG chain that retrieves documents, generates an answer, and formats the answer
from typing import List

from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs: List[Document]) -> str:
    """Render documents as prompt text.

    Each document becomes a ``Content: ...`` line followed by a
    ``Source: ...`` line taken from its ``source`` metadata entry; the
    rendered documents are separated by a blank line.
    """
    rendered = []
    for doc in docs:
        rendered.append(
            f"Content: {doc.page_content}\nSource: {doc.metadata['source']}"
        )
    return "\n\n".join(rendered)


# Answering sub-chain: replaces the retrieved documents by their formatted text,
# fills the prompt, calls the LLM and parses the reply into a string.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        source_documents=(lambda x: format_docs(x["source_documents"]))
    )
    | prompt
    | llm
    | StrOutputParser()
)
# Full chain: the input question is passed to the retriever and passed through
# unchanged, then the generated answer is added under "answer". The result exposes
# "source_documents", "question" and "answer".
rag_chain = RunnableParallel(
    {
        "source_documents": retriever,
        "question": RunnablePassthrough(),
    }
).assign(answer=rag_chain_from_docs)

### Generate and ask evaluation questions
In this section you will create a dataframe with the following columns:
- `question`: the question you want to ask
- `ground_truth`: the correct answer to the question
- `question_by`: the method used to create the question
- `answer`: the answer of the RAG System
- `source`: the ids of source documents retrieved by the RAG System
- `embedding`: the embedding of the question

You can skip this part and download the dataframe from: xy.com/df_questions_answers.json

In [None]:
import json

# load question and answer pairs that were generated using the ChatGPT web interface
# (context managers make sure the file handles are closed again)
with open("qa_gpt4.json") as f:
    qa_gpt4 = json.load(f)
with open("qa_gpt35.json") as f:
    qa_gpt35 = json.load(f)

In [None]:
from ragas.testset import TestsetGenerator
import pickle

def _load_or_generate_testset(cache_file, test_size=100, **generator_kwargs):
    """Load a pickled ragas testset from ``cache_file`` or generate and cache it.

    Args:
        cache_file: Path of the pickle file used as cache.
        test_size: Number of questions requested from the generator.
        **generator_kwargs: Passed on to ``TestsetGenerator.from_default``.

    Returns:
        The testset generated from ``docs`` (or the previously pickled one).
    """
    cache_path = Path(cache_file)
    if cache_path.exists():
        # NOTE: pickle must only be used with cache files created by this notebook.
        with cache_path.open("rb") as f:
            return pickle.load(f)

    generator = TestsetGenerator.from_default(**generator_kwargs)
    testset = generator.generate(docs, test_size)
    with cache_path.open("wb") as f:
        pickle.dump(testset, f)
    return testset


# generate testset using the ragas default models
# (originally described as gpt-3.5 for generation and gpt-4 for filtering)
testset_100_gpt35_40 = _load_or_generate_testset(
    "testset_generation_100a_gpt35-40.pickle"
)

# generate testset using ragas and gpt-3.5 for generation and for filtering
testset_100_gpt35 = _load_or_generate_testset(
    "testset_generation_100a.pickle",
    openai_generator_llm="gpt-3.5-turbo-16k",
    openai_filter_llm="gpt-3.5-turbo-16k",
)

# second testset using gpt-3.5 for generation
# NOTE(review): the original comment said gpt-3.5 is also used for filtering, but the
# filter model configured here is "gpt-4-turbo-16k", which does not look like a
# published OpenAI model id - confirm the intended model.
testset_100_gpt35_b = _load_or_generate_testset(
    "testset_generation_100b.pickle",
    openai_generator_llm="gpt-3.5-turbo-16k",
    openai_filter_llm="gpt-4-turbo-16k",
)

In [None]:
# Collect the questions of all five sources in one flat list. Every entry is a dict
# with the keys question, ground_truth and question_by (the origin of the question).
questions_all = [
    # question/answer pairs written with the ChatGPT web interface
    *(
        dict(question=qa["question"], ground_truth=qa["answer"], question_by="gpt4")
        for qa in qa_gpt4
    ),
    *(
        dict(question=qa["question"], ground_truth=qa["answer"], question_by="gpt35")
        for qa in qa_gpt35
    ),
    # testsets generated with ragas
    *(
        dict(
            question=qa.question,
            ground_truth=qa.ground_truth,
            question_by="rags_gpt35_40",
        )
        for qa in testset_100_gpt35_40.test_data
    ),
    *(
        dict(
            question=qa.question,
            ground_truth=qa.ground_truth,
            question_by="ragas_gpt35",
        )
        for qa in testset_100_gpt35.test_data
    ),
    # the second gpt-3.5 testset carries the same label as the first one
    *(
        dict(
            question=qa.question,
            ground_truth=qa.ground_truth,
            question_by="ragas_gpt35",
        )
        for qa in testset_100_gpt35_b.test_data
    ),
]

len(questions_all)

In [None]:
# build a dataframe with questions and ground truth answers

import pandas as pd

# One row per question; the id is numbered by the position in `questions_all`.
# Of questions with identical text only the first occurrence is kept.
df_questions = pd.DataFrame(
    {
        "id": [f"Question {i}" for i in range(len(questions_all))],
        **{
            key: [qa[key] for qa in questions_all]
            for key in ("question", "ground_truth", "question_by")
        },
    }
).drop_duplicates(subset=["question"])
df_questions

In [None]:
# extract embeddings for all documents from the vector store and store them in a dataframe
import pandas as pd

# Fetch metadata, text and embedding of every entry in the vector store.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df_docs = pd.DataFrame(
    {
        # recompute the id from the metadata with the same hash that was used as
        # vector store id when the splits were added
        "id": [stable_hash_meta(metadata) for metadata in all_docs["metadatas"]],
        "source": [metadata.get("source") for metadata in all_docs["metadatas"]],
        # -1 marks documents whose metadata has no "page" entry
        "page": [metadata.get("page", -1) for metadata in all_docs["metadatas"]],
        "document": all_docs["documents"],
        "embedding": all_docs["embeddings"],
    }
)

In [None]:
# load the cached RAG answers and source_documents ids from a file - or create an empty column
# Add the cached "answer" column, matched on the question text.
df_questions_answers = add_cached_column_from_file(
    df_questions, "rag_response_cache.txt", "question", "answer"
)
# (a bare expression in the middle of a cell is not displayed)
df_questions_answers
# Add the cached "source_documents" column from the same cache file.
df_questions_answers = add_cached_column_from_file(
    df_questions_answers, "rag_response_cache.txt", "question", "source_documents"
)
df_questions_answers

In [None]:
# Ask the RAG chain every question that has no cached answer yet.
for label, row in df_questions_answers.iterrows():
    if not pd.isnull(row["answer"]):  # pd.isnull also covers None
        continue

    response = rag_chain.invoke(row["question"])
    # ids of the retrieved documents (same hash as the ids in `df_docs`)
    source_ids = [
        stable_hash_meta(source_document.metadata)
        for source_document in response["source_documents"]
    ]

    # `iterrows` yields index labels, so address the cell by label. `.at` sets a
    # single cell and therefore stores the list of ids as one value.
    df_questions_answers.at[label, "answer"] = response["answer"]
    df_questions_answers.at[label, "source_documents"] = source_ids

    # save the response to the cache right away so an interrupted run can resume
    write_dict_to_file(
        {
            "question": row["question"],
            "answer": response["answer"],
            "source_documents": source_ids,
        },
        "rag_response_cache.txt",
    )

# get the context documents content for each question
# (one id -> text lookup table instead of scanning `df_docs` for every id; for
# duplicated ids the first document wins; an unknown id raises a KeyError)
document_by_id = (
    df_docs.drop_duplicates(subset=["id"]).set_index("id")["document"].to_dict()
)
df_questions_answers["contexts"] = df_questions_answers["source_documents"].apply(
    lambda doc_ids: [document_by_id[doc_id] for doc_id in doc_ids]
)

In [None]:
# additionally get embeddings for questions (cached in a pickle file)
# NOTE: the cache is positional - it holds one embedding per row of
# `df_questions_answers` at the time it was written. Delete the file when the set
# of questions changes, otherwise the assignment below fails or misaligns.
question_embeddings_file = Path("question_embeddings_202402201312.pickle")
if question_embeddings_file.exists():
    with question_embeddings_file.open("rb") as f:
        question_embeddings = pickle.load(f)
else:
    question_embeddings = [
        embeddings_model.embed_query(question)
        for question in df_questions_answers["question"]
    ]
    with question_embeddings_file.open("wb") as f:
        pickle.dump(question_embeddings, f)

df_questions_answers["embedding"] = question_embeddings
df_questions_answers

## Evaluate

In this section we add a new column to the dataframe:
- `answer_correctness`: the correctness of the answer of the rag_chain evaluated by ragas

In [None]:
# Inspection only: displays the raw vector store content; the result is not stored.
docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])

In [None]:
# Add previously computed "answer_correctness" values from the ragas result cache,
# matched on the question text.
df_questions_answers = add_cached_column_from_file(
    df_questions_answers, "ragas_result_cache.txt", "question", "answer_correctness"
)
df_questions_answers

In [None]:
# Work on a copy so the evaluation-specific column changes do not touch
# `df_questions_answers`.
df_qa_eval = df_questions_answers.copy()

# ragas reads the reference answers from a "ground_truths" column holding a list
# per question, so rename the column and wrap every non-list value in a list.
df_qa_eval = df_qa_eval.rename(columns={"ground_truth": "ground_truths"})
df_qa_eval["ground_truths"] = df_qa_eval["ground_truths"].map(
    lambda gt: gt if isinstance(gt, list) else [gt]
)

In [None]:
from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

# evaluate the answer correctness if not already done
fields = ["question", "answer", "contexts", "ground_truths"]
# `iterrows` yields index labels; keep the row position separately so that
# positional (`iloc`) and label based (`loc`) access are not mixed up.
for position, (label, row) in enumerate(df_qa_eval.iterrows()):
    if not pd.isnull(row["answer_correctness"]):  # pd.isnull also covers None
        continue

    # evaluate one question at a time so every result can be cached immediately
    evaluation_result = evaluate(
        Dataset.from_pandas(
            df_qa_eval.iloc[position : position + 1][fields],
            preserve_index=False,  # never add the dataframe index as a dataset column
        ),
        [answer_correctness],
    )
    score = evaluation_result["answer_correctness"]
    df_qa_eval.loc[label, "answer_correctness"] = score

    # save the result to the cache right away so an interrupted run can resume
    write_dict_to_file(
        {"question": row["question"], "answer_correctness": score},
        "ragas_result_cache.txt",
    )

# write the answer correctness to the original dataframe
df_questions_answers["answer_correctness"] = df_qa_eval["answer_correctness"]

## Link from documents to questions that used the document as source
This section builds `df_documents_agg`: the documents dataframe `df_docs` extended by the number and the ids of the questions that used each document as source.

In [None]:
# Attach to every document which questions retrieved it as a source.

# One row per (question, retrieved document id) pair
df_questions_exploded = df_qa_eval.explode("source_documents")

# Per document id: the number of questions that retrieved it and their ids
agg = df_questions_exploded.groupby("source_documents").agg(
    num_questions=("id", "count"),
    question_ids=("id", lambda question_ids: list(question_ids)),
)
agg = agg.reset_index().rename(columns={"source_documents": "id"})

# Left join keeps documents that no question retrieved
df_documents_agg = df_docs.merge(agg, on="id", how="left")

# Those documents have NaN in both new columns: use an empty list / 0 instead
df_documents_agg["question_ids"] = df_documents_agg["question_ids"].map(
    lambda ids: ids if isinstance(ids, list) else []
)
df_documents_agg["num_questions"] = df_documents_agg["num_questions"].fillna(0)

In [None]:
# Stack question rows and document rows into one dataframe; columns that exist in
# only one of the two frames are NaN for the rows of the other. Rows that have a
# value in "question" are the question rows.
df = pd.concat([df_qa_eval, df_documents_agg], axis=0)

In [None]:
# Fit UMAP on different subsets of the embeddings and project ALL rows with each fit.
from umap import UMAP
import numpy as np

# shared by all three fits so the projections differ only in their training data
umap_settings = dict(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42)
all_embeddings = df["embedding"].values.tolist()

# fit on the question rows only
# NOTE(review): `umap_questions` is computed but never written to `df` in this cell.
df_questions = df[df["question"].notna()]
umap = UMAP(**umap_settings).fit(df_questions["embedding"].values.tolist())
umap_questions = umap.transform(all_embeddings)

# fit on the document rows only (rows without a question)
df_without_questions = df[df["question"].isna()]
umap = UMAP(**umap_settings).fit(df_without_questions["embedding"].values.tolist())
umap_docs = umap.transform(all_embeddings)
df["umap_docs"] = umap_docs.tolist()

# fit on questions and documents together
umap = UMAP(**umap_settings).fit(all_embeddings)
umap_all = umap.transform(all_embeddings)
df["umap"] = umap_all.tolist()

# Euclidean distance from every row's embedding to its nearest question embedding
# (brute force, could be optimized using ChromaDB)
question_embeddings = np.array(df_questions["embedding"].tolist())
df["nearest_question_dist"] = [
    np.linalg.norm(np.array(embedding) - question_embeddings, axis=1).min()
    for embedding in df["embedding"].values
]

# write the dataframe to parquet for later use
df.to_parquet("df_f1_rag_docs_and_questions_umaps_local.parquet")

## Visualize
Adapt the first cell to use the downloaded dataframes if you skipped the preparation steps.

In [None]:
# concat the df containing the questions and the df containing the documents
import pandas as pd

# df = pd.concat([df_qa_eval, df_documents_agg], axis=0)

# OR Load the data from downloaded file https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/df_f1_rag_docs_and_questions_umaps_v3.parquet
# NOTE: the preparation section writes its result to
# "df_f1_rag_docs_and_questions_umaps_local.parquet"; change the file name below to
# visualize your own data instead of the downloaded file.
df = pd.read_parquet("df_f1_rag_docs_and_questions_umaps_v3.parquet")

In [None]:
# show the dataframe with the question and answer in spotlight
from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

# Every column with "umap" in its name is passed to Spotlight with the Embedding dtype.
umap_columns = [column for column in df.columns if "umap" in column]
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype=dict.fromkeys(umap_columns, Embedding),
)

## UMAP visualization forms clusters of the questions; workaround: fit UMAP only on documents