In [None]:
%pip install langchain_community
%pip install langchain_text_splitters
%pip install langchain-openai 
%pip install langchainhub 
%pip install chromadb 
%pip install langchain
%pip install beautifulsoup4

In [None]:
import getpass
import os

import bs4
import chromadb
import openai
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# OpenAI Setup
# Never paste the key into the notebook: it would be saved (and possibly
# committed) with the file. Reuse OPENAI_API_KEY when the environment already
# provides it; otherwise prompt for it without echoing the input.
# In a non-interactive run with no key set, the prompt fails loudly instead of
# silently continuing with an empty key.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('Enter your OpenAI API key: ').strip()

# Mirror the key onto the openai module for any code that reads it from there.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
#### INDEXING ####

In [None]:
# Load Documents
# Restrict HTML parsing to the elements carrying the post's title, header and
# body CSS classes so the rest of the page is not loaded into the documents.
post_sections_only = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

loader = WebBaseLoader(
    web_paths=("https://kbourne.github.io/chapter1.html",),
    bs_kwargs={"parse_only": post_sections_only},
)

# Fetch the page over the network and turn it into LangChain documents.
docs = loader.load()

In [None]:
# Split
# Chunking settings gathered in one place:
#   chunk_size / chunk_overlap are measured with `len`, i.e. in characters;
#   separators are treated as plain strings, not regular expressions.
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the loaded documents into overlapping chunks for embedding.
splits = text_splitter.split_documents(docs)

In [None]:
# Embed
# Embed each chunk with OpenAI embeddings and index the vectors in Chroma.
# NOTE(review): re-running this cell in the same kernel may add the chunks to
# the store a second time - confirm, and restart the kernel if results repeat.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Expose the store through the retriever interface used by the chain.
retriever = vectorstore.as_retriever()

In [None]:
#### RETRIEVAL and GENERATION ####

In [None]:
# Prompt
# Pull the RAG prompt template from the LangChain Hub (requires network
# access). The chain feeds it the "context" and "question" inputs.
rag_prompt_ref = "jclemens24/rag-prompt"
prompt = hub.pull(rag_prompt_ref)

In [None]:
# Post-processing
def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

In [None]:
# LLM
# Chat model that writes the final answer; temperature 0 keeps sampling as
# deterministic as the API allows.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4o-mini",
)

In [None]:
# Chain it all together with LangChain
# Step 1 - build the prompt inputs from the incoming question:
#   "context":  retrieve matching chunks, then merge them with format_docs
#   "question": the question itself, passed through unchanged
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Step 2 - fill the prompt, call the model, and reduce the reply to a string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [None]:
# Question - run the chain
# The bare last expression lets the notebook display the answer string.
question = "What are the advantages of using RAG?"
rag_chain.invoke(question)