In [1]:
import os
from pathlib import Path
import re

from dotenv import find_dotenv, load_dotenv
from rich import print as rprint
from rich.pretty import Pretty, pprint
from rich.text import Text

In [2]:
load_dotenv(find_dotenv('.env'))

True

In [3]:
os.environ["LANGCHAIN_PROJECT"] = "RAG From Scratch: Part 3 (Retrieval)"

In [4]:
DATA_PATH = Path('./data')
VECTORSTORE_PATH = DATA_PATH / 'vectorstore'

# Part 3: Retrieval

![](images/03-01-retrieval.png)

![](images/03-02-retrieval.png)

## Configure components

In [5]:
from langchain_openai import OpenAIEmbeddings

In [6]:
embeddings_model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embeddings_model_name)
len(embeddings.embed_query("Hello"))

1536

## Load documents

In [7]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [8]:
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()
len(docs)

1

In [9]:
print(docs[0].page_content[:1000])



      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:

Planning

Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.
Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.


Memory

Short-term memory: I 

## Split documents

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
len(splits)

66

## Store documents

**Similarity search**

In [12]:
from langchain_core.vectorstores import InMemoryVectorStore

In [13]:
vectorstore = InMemoryVectorStore(embeddings)
doc_ids = vectorstore.add_documents(documents=splits)
len(doc_ids), len(vectorstore.store)

(66, 66)

In [14]:
records = list(vectorstore.store.items())
Pretty(records[0], max_length=20, no_wrap=False)

## Semantic search (vector store)

In [15]:
query = "What is Task Decomposition?"
relevant_documents = vectorstore.similarity_search(query)
len(relevant_documents)

4

In [16]:
for document in relevant_documents:
    rprint(document)

In [17]:
rprint(query)

chunk_pattern = re.compile(r'^Chunk \d+.*:$', flags=re.MULTILINE)
terms_pattern = re.compile(rf'\b({"|".join(query.split())})\b', flags=re.IGNORECASE)

for chunk_id, document in enumerate(relevant_documents, start=1):
    text = Text(f"Chunk {chunk_id}:\n{document.page_content}")
    text.highlight_regex(chunk_pattern, "bold green")
    text.highlight_regex(terms_pattern, "underline red")
    rprint(text)

## Semantic search (retriever)

In [18]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}
)
retriever

VectorStoreRetriever(tags=['InMemoryVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_core.vectorstores.in_memory.InMemoryVectorStore object at 0x7fca9ceeb090>, search_kwargs={'k': 4})

In [19]:
query = "What is Task Decomposition?"
relevant_documents = retriever.invoke(query)
len(relevant_documents)

4

In [20]:
for document in relevant_documents:
    rprint(document)

In [21]:
rprint(query)

chunk_pattern = re.compile(r'^Chunk \d+.*:$', flags=re.MULTILINE)
terms_pattern = re.compile(rf'\b({"|".join(query.split())})\b', flags=re.IGNORECASE)

for chunk_id, document in enumerate(relevant_documents, start=1):
    text = Text(f"Chunk {chunk_id}:\n{document.page_content}")
    text.highlight_regex(chunk_pattern, "bold green")
    text.highlight_regex(terms_pattern, "underline red")
    rprint(text)