In [1]:
!pip install langchain_community langchain-openai chromadb langchainhub langchain tiktoken

Collecting langchain_community
  Downloading langchain_community-0.3.15-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.1-py3-none-any.whl.metadata (2.7 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.31 (from langchain_community)
  Downloading langchain_core-0.3.31-py3-none-any.whl.metadata (6.3 

In [25]:
from google.colab import userdata


In [24]:
import os
# LangSmith tracing switches
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# Copy the API keys from Colab's secret store into the process environment,
# so they are never written into the notebook itself.
for secret_name in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [23]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


In [37]:
# Load the external knowledge base from the web.
# Parsing is restricted to elements carrying the post-content / post-title /
# post-header CSS classes; everything else on the page is skipped.
post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",),
    bs_kwargs={"parse_only": post_strainer},
)
docs = loader.load()

In [38]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, with 200 of overlap between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
split = text_splitter.split_documents(docs)

In [39]:
# Embed every chunk with the OpenAI embedding model and store the
# resulting vectors in a Chroma vector store.
embedding_model = OpenAIEmbeddings()
vector_store = Chroma.from_documents(documents=split, embedding=embedding_model)


In [40]:
# Retriever that performs a semantic search over the vector store (k=3 per query)
retriever_search_kwargs = {"k": 3}
rag_retriever = vector_store.as_retriever(search_kwargs=retriever_search_kwargs)

In [41]:
# Pull the RAG prompt template from the LangChain Hub
rag_prompt_id = "rlm/rag-prompt"
prompt = hub.pull(rag_prompt_id)

In [42]:
# Chat model that writes the final answer; temperature=0 requests the least random sampling
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

In [43]:
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

In [46]:
# Assemble the final RAG chain:
#   - "context": the question goes through the retriever, and the retrieved
#     documents are merged into one string by format_docs
#   - "question": the input is passed through unchanged
# The mapping feeds the prompt, then the LLM, then a parser that returns a plain string.
chain_inputs = {
    "context": rag_retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

rag_chain.invoke("What is Spurious Correlation ?")

'Spurious correlation, also known as shortcut learning, refers to the phenomenon where a classifier overfits to irrelevant features instead of learning the intended task. This can lead to poor generalization and performance on out-of-distribution data. Reward hacking examples in real life include optimizing for proxy metrics that do not align with the true goals, such as maximizing engagement at the expense of user well-being.'