YT: https://www.youtube.com/watch?v=sVcwVQRHIc8&t=3s <br>
code: https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb

In [73]:
# setup langsmith

import getpass
import os
import numpy as np
import pandas as pd

try:
    # load environment variables from .env file (requires `python-dotenv`)
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    # python-dotenv is optional: without it, only variables already present in
    # the process environment are used.
    pass

# Ask for the API key only when none is configured. The prompt marks the key as
# optional, so a blank answer is allowed, but it must not be stored as an empty
# key.
if not os.environ.get("LANGSMITH_API_KEY"):
    _entered_key = getpass.getpass(
        prompt="Enter your LangSmith API key (optional): "
    ).strip()
    if _entered_key:
        os.environ["LANGSMITH_API_KEY"] = _entered_key
    else:
        # Drop a pre-existing empty value so later checks see "no key" consistently.
        os.environ.pop("LANGSMITH_API_KEY", None)
    # Do not leave the secret behind as a notebook-level variable.
    del _entered_key

# Enable tracing only when a key is available; with no key, tracing is switched
# off instead of being forced on.
os.environ["LANGSMITH_TRACING"] = (
    "true" if os.environ.get("LANGSMITH_API_KEY") else "false"
)

# The project name is not a secret, so it is read with input() to keep what the
# user types visible. A blank answer falls back to "default".
if not os.environ.get("LANGSMITH_PROJECT"):
    os.environ["LANGSMITH_PROJECT"] = (
        input('Enter your LangSmith Project Name (default = "default"): ').strip()
        or "default"
    )
# if "OPENAI_API_KEY" not in os.environ:
#     os.environ["OPENAI_API_KEY"] = getpass.getpass(
#         prompt="Enter your OpenAI API key (required if using OpenAI): "
#     )

In [2]:
# Set GOOGLE_APPLICATION_CREDENTIALS to a JSON file one directory above the notebook
# (presumably a service-account key file; confirm).
# NOTE(review): the file name is specific to one project/machine; consider supplying
# this variable through the environment or the .env file instead of hardcoding it.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../serious-hold-453009-g1-eff08c861e11.json"

In [3]:
# Show the LangSmith project name currently set in the environment.
os.environ["LANGSMITH_PROJECT"]

'rag-101'

In [9]:
# Ensure your VertexAI credentials are configured

from langchain.chat_models import init_chat_model
from langchain_google_vertexai import VertexAIEmbeddings

# Chat model used by the RAG chain for generation (Gemini via the google_vertexai provider).
llm = init_chat_model("gemini-2.0-flash-001", model_provider="google_vertexai")
# Embedding model used both to index the document chunks and to embed queries.
embeddings = VertexAIEmbeddings(model="text-embedding-004")

# Getting Started with RAG

### Overview

In [32]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [6]:
#### INDEXING ####

# Load the blog post, restricting HTML parsing to elements whose CSS class is
# one of the post's content, title, or header classes.
post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_strainer},
)
docs = loader.load()

# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [7]:
# Embed
# Embed every chunk in `splits` with `embeddings` and store the results in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=embeddings)
# Retriever view of the store; this is what the RAG chain queries for context.
retriever = vectorstore.as_retriever()

In [63]:
def inspect_chroma_vectorstore(vectorstore):
    """
    Display information about documents stored in a Chroma vectorstore.

    Prints the number of stored entries and the keys of the payload returned by
    the underlying collection, then returns the entries as a table.

    Args:
        vectorstore: Object exposing a ``_collection`` attribute that provides
            ``count()`` and ``get()``; ``get()`` must return a dict with
            ``'ids'`` and ``'documents'`` and, optionally, ``'metadatas'``.

    Returns:
        pandas.DataFrame with one row per stored entry and the columns ``id``,
        ``document`` and one ``metadata_<key>`` column for every metadata key
        found on *any* entry. Entries lacking a key (or lacking metadata
        altogether) get a missing value in that column.
    """
    # NOTE: `_collection` is a private attribute of the vectorstore wrapper.
    collection = vectorstore._collection

    count = collection.count()
    print(f"Total documents in vectorstore: {count}")

    vectorstore_data = collection.get()
    print(f"Available keys: {vectorstore_data.keys()}")

    data = {
        "id": vectorstore_data['ids'],
        "document": vectorstore_data['documents'],
    }

    # Add metadata if available. Individual entries may have no metadata (None),
    # so treat those as empty dicts instead of failing on `.get` / `.keys`.
    metadatas = [m or {} for m in (vectorstore_data.get('metadatas') or [])]

    # Collect keys from every entry, not just the first one; dict.fromkeys
    # de-duplicates while keeping first-seen order so column order is stable.
    metadata_keys = dict.fromkeys(key for m in metadatas for key in m)
    for key in metadata_keys:
        data[f"metadata_{key}"] = [m.get(key) for m in metadatas]

    return pd.DataFrame(data)

# Build the inspection table for the vector store created above and preview its first rows.
df = inspect_chroma_vectorstore(vectorstore)
display(df.head())

Total documents in vectorstore: 66
Available keys: dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])


Unnamed: 0,id,document,metadata_source
0,ffa41241-b0a9-4339-a545-548e6206220c,LLM Powered Autonomous Agents\n \nDate: Jun...,https://lilianweng.github.io/posts/2023-06-23-...
1,eba019db-8810-44f2-8bc8-74253b23b4b6,Memory\n\nShort-term memory: I would consider ...,https://lilianweng.github.io/posts/2023-06-23-...
2,b6bbeb31-f03f-43b6-a54a-1f1b268b9a20,Fig. 1. Overview of a LLM-powered autonomous a...,https://lilianweng.github.io/posts/2023-06-23-...
3,c6de9c76-0a58-42e1-9e9c-00aeb96c9ede,Tree of Thoughts (Yao et al. 2023) extends CoT...,https://lilianweng.github.io/posts/2023-06-23-...
4,8135ced6-a8ef-44a0-8baf-003f5cd6b864,"Another quite distinct approach, LLM+P (Liu et...",https://lilianweng.github.io/posts/2023-06-23-...


In [67]:
# Pull the "rlm/rag-prompt" prompt from the hub and inspect its structure.
prompt = hub.pull("rlm/rag-prompt")
display(prompt)
print()
# Raw template text of the prompt's first message.
display(prompt.messages[0].prompt.template)
# Names of the variables the template expects to be filled in.
print(prompt.input_variables)

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])




"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"

['context', 'question']


In [65]:
#### RETRIEVAL and GENERATION ####

# Prompt
# The "rlm/rag-prompt" template from the hub; it takes `context` and `question` inputs.
prompt = hub.pull("rlm/rag-prompt")

# Post-processing
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line."""
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

# Chain
# The input question feeds two prompt variables: "context" is produced by piping the
# question through `retriever` and then `format_docs`, while "question" passes the
# input through unchanged. The filled prompt goes to the LLM, and StrOutputParser
# turns the model response into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
rag_chain.invoke("What is Task Decomposition?")

'Task decomposition is a method of breaking down complex tasks into smaller, more manageable steps. This can be achieved through prompting the model to "think step by step" or by using task-specific instructions. It can also be accomplished with human input.\n'

In [66]:
# Question: a two-part query (describe HuggingGPT, then list its main components).
rag_chain.invoke("Briefly describe the HuggingGPT and tell what are the main components to implement it?")

'HuggingGPT is a framework that uses ChatGPT as a task planner to select models available on the HuggingFace platform and summarize responses based on the execution results. The main components include task planning, instruction, and response generation. It uses LLM to parse user requests, guide task parsing, and provide summarized results.\n'

In [68]:
# Question whose answer is not in the retrieved context: per the recorded output,
# the model says the context lacks the information rather than answering.
rag_chain.invoke("What is vision transformer?")

"I'm sorry, but the provided context does not contain information about vision transformers. Therefore, I cannot answer your question.\n"

### Indexing

In [70]:
# Documents
# Toy query/document pair; both are embedded below and their vectors compared.
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` encodes to under the tiktoken encoding `encoding_name`."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

# NOTE(review): 'cl100k_base' is a tiktoken encoding, whereas `embeddings` is a
# VertexAIEmbeddings model; presumably this count is only a rough estimate of what
# the embedding model actually tokenizes. Confirm before relying on it.
print(f"Total tokens used: {num_tokens_from_string(question, 'cl100k_base')}")

# NOTE(review): both texts are embedded with embed_query. Confirm whether the
# document side should use embed_documents so it is embedded the same way as the
# chunks stored in the vector store.
query_result = embeddings.embed_query(question)
document_result = embeddings.embed_query(document)
# Number of dimensions in the returned embedding vector.
len(query_result)

Total tokens used: 8


768

In [74]:
# Shape of the query embedding once converted to a NumPy array.
np.array(query_result).shape

(768,)

In [75]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two equal-length numeric vectors.

    Args:
        vec1: First vector (list or array of numbers).
        vec2: Second vector, same length as `vec1`.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the similarity is
        undefined; 0.0 is returned in that case instead of dividing by zero
        (which would yield ``nan`` and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # At least one vector is all zeros, so it has no direction to compare.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Compare the query and document embeddings computed earlier.
similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.6704918572044731
