In [1]:
from dotenv import load_dotenv, find_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma


# Load variables from the .env file located by find_dotenv() into the process
# environment. The result is bound to `_` so the cell displays no output.
# NOTE(review): presumably this supplies the API key that OpenAIEmbeddings
# needs further down — confirm against the .env contents.
_ = load_dotenv(find_dotenv())

In [2]:
# Source pages for the corpus; this list is what gets passed to
# generate_document() later in the notebook.
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

In [3]:
from langchain_community.document_loaders import UnstructuredURLLoader
from unstructured.cleaners.core import remove_punctuation, clean, clean_extra_whitespace
from langchain_community.vectorstores.utils import filter_complex_metadata

In [4]:
def generate_document(urls):
    """Load web pages and return their cleaned elements as LangChain Documents.

    Every URL is loaded with ``UnstructuredURLLoader`` in ``"elements"`` mode,
    so the result holds one Document per page element rather than one per page.

    The text cleaners are applied here, directly on each Document's
    ``page_content``, instead of being handed to the loader as
    ``post_processors``. The retrieval output captured in this notebook still
    contains punctuation, which indicates the loader was not running them.
    All three cleaners are applied in the same order as before.

    Args:
        urls: List of page URLs to load.

    Returns:
        The list returned by ``filter_complex_metadata`` for the cleaned
        Documents. Documents whose text is empty after cleaning are dropped.
    """
    loader = UnstructuredURLLoader(urls=urls, mode="elements")
    elements = loader.load()

    cleaned = []
    for element in elements:
        text = element.page_content
        # Same cleaners, same order, as the original post_processors list.
        for post_processor in (clean, remove_punctuation, clean_extra_whitespace):
            text = post_processor(text)
        if not text:
            # Cleaning left no text, so there is nothing to embed or retrieve.
            continue
        element.page_content = text
        cleaned.append(element)

    return filter_complex_metadata(cleaned)

In [65]:
# Load every page in `urls` into a list of Documents.
# NOTE(review): in the visible notebook, docs_list is only consumed by the
# commented-out Chroma.from_documents(...) index-build cell below.
docs_list = generate_document(urls)

In [54]:
# print("Length of docs: ", len(docs_list))
# print("Docs List: ", docs_list)

In [66]:
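# One-time index build: uncomment and run once (after `docs_list` is populated)
# to create the persisted "rag_chroma" collection in ./.chroma that the
# retriever cell below opens.
# vectorstore = Chroma.from_documents(docs_list,
#                                     embedding=OpenAIEmbeddings(),
#                                     collection_name="rag_chroma",
#                                     persist_directory="./.chroma")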

In [5]:
# Open the "rag_chroma" collection stored under ./.chroma and expose it as a
# retriever. Nothing is indexed here: this only attaches to the given
# persist directory and collection name.
retriever = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./.chroma",
    collection_name="rag_chroma",
).as_retriever()

In [14]:
# Sample query that is passed to the retriever in the next cell.
question = "What does LLM stand for?"

In [15]:
# Run the query through the retriever; the result is a list of Documents
# (see the printed output below).
docs = retriever.invoke(question)

In [16]:
# Dump the retrieved Documents (text plus metadata) for inspection.
print(docs)

[Document(page_content='The LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.', metadata={'category': 'ListItem', 'category_depth': 1, 'filetype': 'text/html', 'parent_id': '81f91154a470509bc00dbd208b8c910b', 'url': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}), Document(page_content='Adversarial Attacks on LLMs', metadata={'category': 'Title', 'category_depth': 0, 'filetype': 'text/html', 'url': 'https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/'}), Document(page_content='Adversarial Attacks on LLMs', metadata={'category': 'Title', 'category_depth': 0, 'filetype': 'text/html', 'url': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}), Document(page_content='Challenges in long-term planning and task decomposition: Planning over a lengthy history and effectively exploring the solution space remain challenging. LLMs struggle to adjust plans when faced with unexpected errors, making them less

In [17]:
# Show only the text of the first retrieved Document.
# Raises IndexError if `docs` is empty.
print(docs[0].page_content)

The LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.
