# 0. Setting Up The Env

## 0.1 Install

In [4]:

!uv pip install bs4 langchainhub langchain_community tiktoken langchain-openai langchainhub chromadb langchain

[2mUsing Python 3.13.3 environment at: D:\01 Work\10-New-Learnings\.venv[0m
[2mAudited [1m8 packages[0m [2min 28ms[0m[0m


## 0.2 Import

In [35]:
import numpy as np

In [5]:
import os
from dotenv import load_dotenv


In [32]:
import tiktoken

In [11]:
# # Old Codes & Package

# import bs4
# from langchain import hub
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_community.vectorstores import Chroma
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
import bs4

# LangChain core
# from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Hub
# from langchain import hub
from langsmith import Client
# Loaders & Vector DBs 
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma

# OpenAI 
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


## 0.3 Constants

In [16]:

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

In [17]:

load_dotenv(override=True)


openai_api_key = os.getenv('OPENAI_API_KEY')
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')


In [18]:
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

OpenAI API Key exists and begins sk-proj-


In [19]:
if langchain_api_key:
    print(f"OpenAI API Key exists and begins {langchain_api_key[:8]}")
else:
    print("OpenAI API Key not set")

OpenAI API Key exists and begins lsv2_pt_


In [20]:
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key

In [23]:
client = Client()

# 1. Overview

## 1.1 Indexing

In [24]:

# Load Documents
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()



In [26]:
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()


  _EPOCH_DATETIME_NAIVE = datetime.datetime.utcfromtimestamp(0)


## 1.2 Retreival & Generation

In [27]:
# Prompt
prompt = client.pull_prompt("rlm/rag-prompt")

# LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [28]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [29]:
# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [30]:
# Question
rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a technique used to break down complex tasks into smaller and simpler steps, allowing for easier execution and understanding. This approach can be implemented through various methods such as simple prompting, task-specific instructions, or relying on external classical planners. By decomposing tasks, agents can effectively plan ahead and navigate through the different stages of a task more efficiently.'

# 2. Indexing

**Refere Doc**
* [Count Tokens](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb)
* [Token ~ 4 Chars](https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them)
* [OpenAI Embeddings](https://docs.langchain.com/oss/python/integrations/text_embedding/openai)
* [cl100k_base & cosine similarity](https://platform.openai.com/docs/guides/embeddings#faq)
* ()


In [31]:

# Documents
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [33]:


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

8

In [34]:
from langchain_openai import OpenAIEmbeddings
embd = OpenAIEmbeddings()
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)
len(query_result)

1536

In [36]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.8806915835035412


In [37]:
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

In [38]:
# Split
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

In [39]:
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

# 3. Retrieving

In [40]:
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())


retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [44]:
docs = retriever.invoke("What is Task Decomposition?")
len(docs)

1

# 4. Generation

In [47]:
from langchain_core.prompts import ChatPromptTemplate


In [48]:
# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

In [49]:
# LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Chain =  Prompt + LLM
chain = prompt | llm


# Run
chain.invoke({"context":docs,"question":"What is Task Decomposition?"})

AIMessage(content='Task Decomposition is a technique used by agents to break down complex tasks into smaller and simpler steps in order to plan ahead and enhance model performance. It involves transforming big tasks into multiple manageable tasks and exploring multiple reasoning possibilities at each step. This can be done through prompting techniques like Chain of Thought or Tree of Thoughts, as well as using task-specific instructions or human inputs.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 74, 'prompt_tokens': 315, 'total_tokens': 389, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CsuNfm76ATCLivJZCX9HXUAgqxHzW', 'service_tier': 'default', 'finish_reason': 'stop', 'l

In [50]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a technique used by agents to break down complex tasks into smaller and simpler steps in order to plan ahead and enhance model performance. It involves transforming big tasks into multiple manageable tasks and exploring multiple reasoning possibilities at each step. This can be done through prompting techniques like Chain of Thought or Tree of Thoughts, as well as using task-specific instructions or human inputs.'

# End