# Visualize your RAG Data - EDA for Retrieval-Augmented Generation
## How to use UMAP dimensionality reduction for embeddings to show questions, answers, and their relationships to source documents with OpenAI, LangChain, and ChromaDB
This notebook is part of an [article at ITNEXT.](https://itnext.io/visualize-your-rag-data-eda-for-retrieval-augmented-generation-0701ee98768f)

### Get ready

In [None]:
!pip install langchain langchain-openai chromadb renumics-spotlight
%env OPENAI_API_KEY=<your-api-key>

### Prepare documents

In [None]:
# create embeddings model and vector store
from langchain_openai import OpenAIEmbeddings

# Chroma is imported from langchain_community, the same package the document
# loaders in this notebook come from; langchain.vectorstores.chroma is a
# deprecated import path.
from langchain_community.vectorstores import Chroma

# Embedding model handed to the vector store and used later to embed the
# question and the answer.
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Collection "docs_store", configured with the local "docs-db" directory as its
# persist directory.
docs_vectorstore = Chroma(
    collection_name="docs_store",
    embedding_function=embeddings_model,
    persist_directory="docs-db",
)

In [None]:
# load documents with the LangChain document loader
from langchain_community.document_loaders import BSHTMLLoader, DirectoryLoader

# Gather the loader settings first, then build the loader for the "docs" directory
# from them: every matching *.html file is read with BSHTMLLoader, opened as UTF-8.
_html_loader_options = {
    "glob": "*.html",
    "loader_cls": BSHTMLLoader,
    "loader_kwargs": {"open_encoding": "utf-8"},
    "recursive": True,
    "show_progress": True,
}
loader = DirectoryLoader("docs", **_html_loader_options)
docs = loader.load()

In [None]:
# divide documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size 1000 with an overlap of 200 between
# neighbouring chunks; add_start_index is switched on.
_splitter_settings = dict(chunk_size=1000, chunk_overlap=200, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(**_splitter_settings)
splits = text_splitter.split_documents(docs)

In [None]:
# add documents to the vector store - use an id that can be reconstructed from the metadata
import hashlib
import json
from langchain_core.documents import Document


def stable_hash(doc: Document) -> str:
    """
    Derive a deterministic id for a document from its metadata alone.

    The metadata dict is serialized to JSON with sorted keys, so the order in
    which the keys were inserted does not influence the result. The SHA-1 hex
    digest of the encoded JSON string is returned.
    """
    canonical_metadata = json.dumps(doc.metadata, sort_keys=True)
    digest = hashlib.sha1(canonical_metadata.encode())
    return digest.hexdigest()


# One id per chunk, computed from the chunk's metadata, then store the chunks
# under those ids and persist the vector store.
split_ids = [stable_hash(split) for split in splits]
docs_vectorstore.add_documents(splits, ids=split_ids)
docs_vectorstore.persist()

### Build the LangChain

In [None]:
# create language model and retriever
from langchain_openai import ChatOpenAI

# Chat model used to generate the answer (temperature 0.0).
llm = ChatOpenAI(model="gpt-4", temperature=0.0)

# Retriever backed by the document vector store; "k" is set to 20 in the search arguments.
_retriever_search_kwargs = {"k": 20}
retriever = docs_vectorstore.as_retriever(search_kwargs=_retriever_search_kwargs)

In [None]:
# create a RAG prompt that includes the question and the source documents
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {question} receives the user question and
# {source_documents} receives the formatted source documents. The template asks
# the model to always include a "SOURCES" part in its answer.
template = """
You are an assistant for question-answering tasks.
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
{source_documents}
=========
FINAL ANSWER: """
# Build the chat prompt object from the template string.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# create a RAG chain that retrieves documents, generates an answer, and formats the answer
from typing import List

from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs: List[Document]) -> str:
    """
    Render documents as plain text for insertion into the prompt.

    Each document becomes a "Content: ..." line followed by a "Source: ..."
    line taken from its ``source`` metadata entry; the rendered documents are
    separated by a blank line. An empty list yields an empty string.
    """
    rendered = []
    for doc in docs:
        rendered.append(
            f"Content: {doc.page_content}\nSource: {doc.metadata['source']}"
        )
    return "\n\n".join(rendered)


# Answering part: replace the "source_documents" entry with its formatted text,
# fill the prompt, call the model and parse the model output into a string.
_format_sources = RunnablePassthrough.assign(
    source_documents=(lambda x: format_docs(x["source_documents"]))
)
rag_chain_from_docs = _format_sources | prompt | llm | StrOutputParser()

# Input part: the chain input is passed to the retriever (result stored under
# "source_documents") and passed through unchanged under "question".
_retrieve_and_forward = RunnableParallel(
    {
        "source_documents": retriever,
        "question": RunnablePassthrough(),
    }
)

# Full chain: the output of the answering part is added under "answer".
rag_chain = _retrieve_and_forward.assign(answer=rag_chain_from_docs)

### Ask a Question

In [None]:
# Run the RAG chain on one example question. The chain output is indexed by key:
# the generated answer text is stored under "answer".
question = "Who built the nuerburgring"
response = rag_chain.invoke(question)
answer = response["answer"]
# Last expression of the cell: display the answer.
answer

### Visualize

In [None]:
# extract embeddings for the documents from the vector store and store them in a dataframe
import pandas as pd

# Fetch the stored chunks together with their metadata and embeddings. A dedicated
# name is used so the chain output kept in `response` is not overwritten.
store_records = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df = pd.DataFrame(
    {
        "id": store_records["ids"],
        "source": [metadata.get("source") for metadata in store_records["metadatas"]],
        # -1 marks chunks whose metadata has no "page" entry.
        "page": [metadata.get("page", -1) for metadata in store_records["metadatas"]],
        "document": store_records["documents"],
        # One embedding vector per row: list() leaves a list of lists unchanged and
        # turns a 2-D array into a list of 1-D vectors, which a DataFrame column needs.
        "embedding": list(store_records["embeddings"]),
    }
)
# Flag the chunks whose text contains the literal string "Eichler" (plain substring
# match, no regex; missing text counts as "not contained").
df["contains_answer"] = df["document"].str.contains("Eichler", regex=False, na=False)
# Positions of the flagged rows.
df["contains_answer"].to_numpy().nonzero()

In [None]:
# Preview the first rows only instead of rendering the entire dataframe.
df.head()

In [None]:
# add the question and answer with their embeddings to the dataframe
# Single-row frames for the question and the answer, each with its own embedding.
question_row = pd.DataFrame(
    {
        "id": ["question"],
        "question": [question],
        "embedding": [embeddings_model.embed_query(question)],
    }
)
answer_row = pd.DataFrame(
    {
        "id": ["answer"],
        "answer": [answer],
        "embedding": [embeddings_model.embed_query(answer)],
    }
)
# Drop question/answer rows left over from an earlier run of this cell so that
# re-running it does not duplicate them.
document_rows = df[~df["id"].isin(["question", "answer"])]
# Question and answer go first; ignore_index gives every row a unique index label.
df = pd.concat([question_row, answer_row, document_rows], ignore_index=True)
# Preview: the question and answer rows are at the top.
df.head()

In [None]:
# calculate the distance (L2 norm) between the question and the document embeddings
import numpy as np

question_embedding = embeddings_model.embed_query(question)
# Convert the question embedding to an array once, then compute the Euclidean
# norm of the difference to every row's embedding.
_question_vector = np.array(question_embedding)
df["dist"] = [
    np.linalg.norm(np.array(row_embedding) - _question_vector)
    for row_embedding in df["embedding"]
]

In [None]:
# Show the dataframe (document chunks plus the question and answer rows, including
# the "dist" column computed above) in Renumics Spotlight.
from renumics import spotlight

spotlight.show(df)