In [1]:
import os
from pathlib import Path
import re

from dotenv import find_dotenv, load_dotenv

In [2]:
# Locate a `.env` file with find_dotenv and hand its path to load_dotenv.
# The call is the last expression, so its return value is shown as the cell output.
load_dotenv(find_dotenv('.env'))

True

In [3]:
# Set the LANGCHAIN_PROJECT environment variable for this process.
# NOTE(review): presumably read by LangSmith tracing to group runs under this project name — confirm.
os.environ["LANGCHAIN_PROJECT"] = "RAG From Scratch: Part 2 (Indexing)"

In [4]:
# Root folder for local data, relative to the current working directory.
DATA_PATH = Path('./data')
# Sub-folder of DATA_PATH set aside for the vector store.
VECTORSTORE_PATH = DATA_PATH / 'vectorstore'

# Part 2: Indexing

![](images/02-indexing.png)

## Configure components

In [5]:
from langchain_openai import OpenAIEmbeddings

In [6]:
# Name of the OpenAI embedding model, kept in a variable so it can be referenced by name elsewhere.
embeddings_model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embeddings_model_name)
# Embed a short probe string and display the length of the returned vector.
len(embeddings.embed_query("Hello"))

1536

## Load documents

In [7]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [8]:
# Load the blog post at the given URL. The SoupStrainer passed through
# bs_kwargs limits parsing to elements whose class is "post-content",
# "post-title" or "post-header".
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()
# Display how many documents were loaded.
len(docs)

1

In [9]:
# Preview the first 1000 characters of the first loaded document's text.
print(docs[0].page_content[:1000])



      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:

Planning

Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.
Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.


Memory

Short-term memory: I 

## Split documents

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
# Configure a recursive character splitter with chunk_size=1000 and
# chunk_overlap=200, split the loaded documents with it, and display the
# number of resulting chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
len(splits)

66

## Store documents

In [12]:
import chromadb
from chromadb.config import Settings
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [13]:
def get_collection_size(vectorstore):
    """Return how many entries ``vectorstore`` currently holds.

    The count is the length of the ``"ids"`` list in the mapping returned by
    ``vectorstore.get()``.

    Args:
        vectorstore: Object exposing a ``get()`` method whose result can be
            indexed with ``"ids"``.

    Returns:
        The number of ids, or 0 if the lookup raises any ``Exception``
        (best-effort: failures are reported as an empty collection).
    """
    try:
        stored = vectorstore.get()
        return len(stored["ids"])
    except Exception:
        # Deliberate best-effort fallback: any failure counts as "empty".
        return 0

In [14]:
# Name of the Chroma collection used by this notebook.
collection_name="embeddings"

# Persistent Chroma client rooted at VECTORSTORE_PATH, created with
# anonymized telemetry turned off.
vectorstore_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(
    path=str(VECTORSTORE_PATH), settings=vectorstore_settings
)

# Delete the collection through a throwaway wrapper before it is recreated below.
# NOTE(review): presumably so that re-running this cell starts from an empty
# collection instead of accumulating duplicates — confirm. This is destructive.
Chroma(collection_name=collection_name, client=client).delete_collection()

# Wrapper over the same collection name that uses `embeddings` as its embedding function.
vectorstore = Chroma(
    collection_name=collection_name, embedding_function=embeddings, client=client
)

# Display the current number of stored entries.
get_collection_size(vectorstore)

0

In [15]:
# Add the chunks to the vector store, then display the resulting collection size.
vectorstore.add_documents(splits)
get_collection_size(vectorstore)

66

**Tokenization**
- [OpenAI tokenizer](https://platform.openai.com/tokenizer)

In [16]:
import tiktoken

In [17]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` produces under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence returned by the encoding's ``encode``.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

In [18]:
# Two short sample texts: a question and a sentence that relates to it.
query = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [19]:
# Ask tiktoken which encoding belongs to the embedding model and keep its name;
# the bare expression displays that name as the cell output.
openai_encoding_name = tiktoken.encoding_for_model(embeddings_model_name).name
openai_encoding_name

'cl100k_base'

In [20]:
# Token count of the query under the model's encoding.
num_tokens_from_string(query, openai_encoding_name)

8

In [21]:
# Token count of the document under the model's encoding.
num_tokens_from_string(document, openai_encoding_name)

7

**Embeddings**

In [22]:
# Embed the query with embed_query and the document with embed_documents.
# embed_documents is given a one-element list, so [0] picks out its single result.
query_embeddings = embeddings.embed_query(query)
document_embeddings = embeddings.embed_documents([document])[0]

# Display the lengths of both vectors side by side.
len(query_embeddings), len(document_embeddings)

(1536, 1536)

**Cosine similarity**

In [23]:
import numpy as np

In [24]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Computes ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``.

    Args:
        vec1: Sequence or array of numbers (intended for 1-D vectors).
        vec2: Sequence or array of numbers with the same length as ``vec1``.

    Returns:
        The cosine of the angle between the vectors. If either vector has
        zero norm the angle is undefined; 0.0 is returned in that case
        instead of dividing by zero and producing NaN.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero vector has no direction; avoid the 0/0 division (NaN plus a
        # RuntimeWarning) and report "no similarity".
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [25]:
# Similarity between the query and the related document.
similarity = cosine_similarity(query_embeddings, document_embeddings)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.546556128332727


In [26]:
# A sentence unrelated to the query, embedded the same way as `document`.
non_relevant_document = "The weather is fine."
non_relevant_document_embeddings = embeddings.embed_documents([non_relevant_document])[0]

# Note: this reassigns `similarity`, replacing any value it held before.
similarity = cosine_similarity(query_embeddings, non_relevant_document_embeddings)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.09272330847288396


In [27]:
# Similarity of the query embedding with itself (reassigns `similarity` again).
similarity = cosine_similarity(query_embeddings, query_embeddings)
print("Cosine Similarity:", similarity)

Cosine Similarity: 1.0
