In [1]:
%pip install langchain langchain-openai chromadb renumics-spotlight python-dotenv beautifulsoup4

Collecting httpx<0.24.0,>=0.23.0 (from renumics-spotlight)
  Obtaining dependency information for httpx<0.24.0,>=0.23.0 from https://files.pythonhosted.org/packages/ac/a2/0260c0f5d73bdf06e8d3fc1013a82b9f0633dc21750c9e3f3cb1dba7bb8c/httpx-0.23.3-py3-none-any.whl.metadata
  Using cached httpx-0.23.3-py3-none-any.whl.metadata (7.1 kB)
Collecting httpcore<0.17.0,>=0.15.0 (from httpx<0.24.0,>=0.23.0->renumics-spotlight)
  Obtaining dependency information for httpcore<0.17.0,>=0.15.0 from https://files.pythonhosted.org/packages/04/7e/ef97af4623024e8159993b3114ce208de4f677098ae058ec5882a1bf7605/httpcore-0.16.3-py3-none-any.whl.metadata
  Using cached httpcore-0.16.3-py3-none-any.whl.metadata (16 kB)
Using cached httpx-0.23.3-py3-none-any.whl (71 kB)
Using cached httpcore-0.16.3-py3-none-any.whl (69 kB)
Installing collected packages: httpcore, httpx
  Attempting uninstall: httpcore
    Found existing installation: httpcore 1.0.4
    Uninstalling httpcore-1.0.4:
      Successfully u

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
docs_vectorstore = Chroma(
    collection_name="docs_store",
    embedding_function=embeddings_model,
    persist_directory="docs-db",
)

In [4]:
from langchain_community.document_loaders import BSHTMLLoader, DirectoryLoader

loader = DirectoryLoader(
    "docs",
    glob="*.html",
    loader_cls=BSHTMLLoader,
    loader_kwargs={"open_encoding": "utf-8"},
    recursive=True,
    show_progress=True,
    silent_errors=True,
)
docs = loader.load()

100%|██████████| 979/979 [00:31<00:00, 31.42it/s]


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
splits = text_splitter.split_documents(docs)

In [6]:
import uuid

def split_list(input_list, chunk_size):
    for i in range(0, len(input_list), chunk_size):
        yield input_list[i:i + chunk_size]

split_docs_chunked = split_list(splits, 5461)

for chunk in split_docs_chunked:
    split_ids = [str(uuid.uuid4()) for _ in chunk]
    docs_vectorstore.add_documents(chunk, ids=split_ids)
    docs_vectorstore.persist()


In [7]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4-0125-preview", temperature=0.0)
retriever = docs_vectorstore.as_retriever(search_kwargs={"k": 20})

In [8]:
from langchain_core.prompts import ChatPromptTemplate

template = """
You are an assistant for question-answering tasks.
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
{source_documents}
=========
FINAL ANSWER: """
prompt = ChatPromptTemplate.from_template(template)

In [9]:
from langchain_core.documents import Document
from typing import List

from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs: List[Document]) -> str:
    return "\n\n".join(
        f"Content: {doc.page_content}\nSource: {doc.metadata['source']}" for doc in docs
    )


rag_chain_from_docs = (
    RunnablePassthrough.assign(
        source_documents=(lambda x: format_docs(x["source_documents"]))
    )
    | prompt
    | llm
    | StrOutputParser()
)
rag_chain = RunnableParallel(
    {
        "source_documents": retriever,
        "question": RunnablePassthrough(),
    }
).assign(answer=rag_chain_from_docs)

In [25]:
question = "Who built the nuerburgring"
response = rag_chain.invoke(question)
answer = response["answer"]
answer

"The Nürburgring was built following a proposal for a dedicated race track around the ancient castle of the town of Nürburg, inspired by Italy's Monza and Targa Florio courses, and Berlin's AVUS. The construction of the track was designed by the Eichler Architekturbüro from Ravensburg, led by architect Gustav Eichler, and began in September 1925. The original purpose of the Nürburgring was to showcase German automotive engineering and racing talent.\n\nSOURCES:\n- docs/Nürburgring.html"

In [26]:
import pandas as pd

response = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df = pd.DataFrame(
    {
        "id": response["ids"],
        "source": [metadata.get("source") for metadata in response["metadatas"]],
        "page": [metadata.get("page", -1) for metadata in response["metadatas"]],
        "document": response["documents"],
        "embedding": response["embeddings"],
    }
)
df["contains_answer"] = df["document"].apply(lambda x: "Eichler" in x)
df["contains_answer"].to_numpy().nonzero()

(array([6415]),)

In [28]:
response["documents"]

['A meeting between the GPDA and FIA president Max Mosley, scheduled for the British Grand Prix, was cancelled by Mosley because of statements made by David Coulthard. Mosley claimed Coulthard\'s statements to the media were a "distortion" of the purpose of the meeting and accused him of stirring up dissent. In retaliation the GPDA released a letter that had been sent to Mosley accusing him of jeopardising the GPDA\'s drive for improved safety:',
 'Wheatcroft has also invested heavily to restore the circuit infrastructure to its former glories. The infield which was excavated during the late 2000s has been completely restored and raised even higher in some areas, while pit and paddock facilities have also been improved. Outside the circuit boundaries, an all terrain course has been constructed, as well as improvements to hospitality buildings and conferencing suites.[2]',
 'The association between BMW and Brabham lasted from 1981 until 1987. Pictured is Nelson Piquet driving the BMW-en

In [24]:
df

Unnamed: 0,id,question,embedding,answer,source,page,document,contains_answer,dist
0,question,Who built the nuerburgring,0.005165,,,,,,1.025942
1,question,Who built the nuerburgring,-0.011626,,,,,,1.086981
2,question,Who built the nuerburgring,-0.008171,,,,,,1.041258
3,question,Who built the nuerburgring,-0.031259,,,,,,1.559077
4,question,Who built the nuerburgring,-0.009418,,,,,,1.055984
...,...,...,...,...,...,...,...,...,...
29848,fffa6e37-f898-4d12-abb0-88f60a7d6748,,"[-0.02245561219751835, -0.012931512668728828, ...",,docs/Silverstone Circuit.html,-1.0,"^ ""British GT Championship Silverstone 1993"". ...",False,0.770408
29849,fffaa6d6-b832-4308-bc6c-31b8210c4c2a,,"[0.02033507637679577, 8.452025940641761e-05, 0...",,docs/Safety car.html,-1.0,The Full Course Yellow condition is the Formul...,False,0.771581
29850,fffbad72-d0a6-4062-b5c2-43b40b7abe08,,"[-0.01694202981889248, 0.008712857030332088, -...",,docs/Circuit de la Sarthe.html,-1.0,"Speed record[edit]\nIn 1988, Team WM Peugeot w...",False,0.681080
29851,fffd94a9-b01f-4fc9-955e-297c9e921806,,"[0.02040393464267254, -0.015941396355628967, -...",,docs/Formula One engines.html,-1.0,"^ a b ""F1 2021: Liberty's 4WD, Porsche & Spec ...",False,0.760815


In [29]:
question_row = pd.DataFrame(
    {
        "id": ["question"],
        "question": [question],
        "embedding": [embeddings_model.embed_query(question)],
    }
)
answer_row = pd.DataFrame(
    {
        "id": ["answer"],
        "answer": [answer],
        "embedding": [embeddings_model.embed_query(answer)],
    }
)
df = pd.concat([question_row, answer_row, df])

In [30]:
import numpy as np
question_embedding = embeddings_model.embed_query(question)
df["dist"] = df.apply(
    lambda row: np.linalg.norm(
        np.array(row["embedding"]) - question_embedding
    ),
    axis=1,
)

In [31]:
from renumics import spotlight
spotlight.show(df)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:63608/'), HBox(children=(Button(description=…

# Output looks

![visualization image](visualize.png)
