# RAG from Scratch

In [None]:
%pip install 

In [None]:
import os
from access import Access

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = Access.LANGCHAIN_API_KEY

In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/")
docs = loader.load()

loader = WebBaseLoader("https://lilianweng.github.io/posts/2024-07-07-hallucination/")
docs.extend(loader.load())

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
docs

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/', 'title': "Prompt Engineering | Lil'Log", 'description': 'Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.\nThis post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models. At its core, the goal of prompt engineering is about alignment and model steerability. Check my previous post on controllable text generation.', 'language': 'en'}, page_content='\n\n\n\n\n\nPrompt Engineering | Lil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n

In [5]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama

chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOllama(model="llama3.1")
    | StrOutputParser()
)

summaries = chain.batch(docs, {"max_concurrency": 5})

In [6]:
summaries

['This is a comprehensive guide to prompt engineering for large language models (LLMs). The article covers various aspects of prompt engineering, including:\n\n1. **Chain-of-Thought Prompting**: This approach involves generating intermediate reasoning steps and presenting them as part of the input to the LLM.\n2. **In-Context Learning**: This technique involves providing the LLM with a set of context examples that illustrate how to answer a specific question or complete a task.\n3. **Active Prompting**: This method involves actively selecting prompts for the LLM based on its performance and adaptability.\n4. **Automatic Chain-of-Thought Prompting**: This approach uses techniques such as natural language processing (NLP) and machine learning to automatically generate chain-of-thought prompts.\n5. **Augmented Language Models**: This technique involves combining LLMs with external tools or APIs to enhance their capabilities.\n\nThe article also discusses various methods for improving the 

In [15]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from typing import List, Any

class SimpleMultiVectorRetriever(BaseRetriever):
    vectorstore: Any
    byte_store: Any
    id_key: str = "doc_id"
    k: int = 4

    def _get_relevant_documents(self, query: str) -> List[Document]:
        # 1. search summary vectors
        summary_docs = self.vectorstore.similarity_search(query, k=self.k)

        # 2. collect unique document IDs
        doc_ids = []
        for doc in summary_docs:
            doc_id = doc.metadata.get(self.id_key)
            if doc_id and doc_id not in doc_ids:
                doc_ids.append(doc_id)

        # 3. fetch full documents from byte store
        docs = []
        for doc_id in doc_ids:
            raw = self.byte_store.mget([doc_id])[0]
            if raw is not None:
                docs.append(
                    Document(page_content=raw.decode("utf-8"))
                )

        return docs


In [17]:
from langchain_core.stores import InMemoryByteStore
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
# from langchain.retrievers.multi_vector import MultiVectorRetriever

vectorstore = Chroma(collection_name="summaries",
                     embedding_function=OllamaEmbeddings(model="mxbai-embed-large"))

store = InMemoryByteStore()
id_key = "doc_id"

retriever = SimpleMultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
    k=4
)
doc_ids = [str(uuid.uuid4()) for _ in docs]

summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]}) for i, s in enumerate(summaries)
]

vectorstore.add_documents(summary_docs)

# retriever.docstore.mset(list(zip(doc_ids, docs)))
store.mset([
    (doc_ids[i], docs[i].page_content.encode("utf-8"))
    for i in range(len(docs))
])

In [19]:
query = "Prompt Engineering"
sub_docs = vectorstore.similarity_search(query, k=1)
sub_docs[0]

Document(metadata={'doc_id': '9e82ad24-7c65-49f8-9854-bdd70ea55762'}, page_content='This is a comprehensive guide to prompt engineering for large language models (LLMs). The article covers various aspects of prompt engineering, including:\n\n1. **Chain-of-Thought Prompting**: This approach involves generating intermediate reasoning steps and presenting them as part of the input to the LLM.\n2. **In-Context Learning**: This technique involves providing the LLM with a set of context examples that illustrate how to answer a specific question or complete a task.\n3. **Active Prompting**: This method involves actively selecting prompts for the LLM based on its performance and adaptability.\n4. **Automatic Chain-of-Thought Prompting**: This approach uses techniques such as natural language processing (NLP) and machine learning to automatically generate chain-of-thought prompts.\n5. **Augmented Language Models**: This technique involves combining LLMs with external tools or APIs to enhance th

In [26]:
retrieved_docs = retriever._get_relevant_documents(query)
retrieved_docs[0].page_content

'\n\n\n\n\n\nPrompt Engineering | Lil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ\n\n\n\n\n\n\n\n\n\n      Prompt Engineering\n    \nDate: March 15, 2023  |  Estimated Reading Time: 21 min  |  Author: Lilian Weng\n\n\n \n\n\nTable of Contents\n\n\n\nBasic Prompting\n\nZero-Shot\n\nFew-shot\n\nTips for Example Selection\n\nTips for Example Ordering\n\n\n\nInstruction Prompting\n\nSelf-Consistency Sampling\n\nChain-of-Thought (CoT)\n\nTypes of CoT prompts\n\nTips and Extensions\n\n\nAutomatic Prompt Design\n\nAugmented Language Models\n\nRetrieval\n\nProgramming Language\n\nExternal APIs\n\n\nCitation\n\nUseful Resources\n\nReferences\n\n\n\n\n\nPrompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights

### Big idea
1. we save the article by summarising it and converting it into vector embeddings and save it in Chroma vectortores.
2. We also save the original documents as bytestore and have the id as the key.
3. During retrieval, we find the most similar summary to the query, but instead of fetching the summary, we fetch the full documents.

this is effective because long text is not suitable for embeddings similarity as there are too many existing words. so by storing the summary in vectorstores that is retrievable and save the full docs as bytestore, and map it by id, we can do effective similarity search but still get the original docs.

Inside the vectorstore, we have the docs id as the metadata, but for the bytestore we can have it as the key so that it automaticall returns the correct document.