HyDE paper (Precise Zero-Shot Dense Retrieval without Relevance Labels): https://arxiv.org/pdf/2212.10496.pdf
LangChain version: https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_5_to_9.ipynb
Video course: https://youtu.be/SaDzIVkYqyY?si=RmuR3tra6QFWGWt7

In [43]:
!wget -O ./tmp/Baidu-Inc-Code-of-Business-Conduct-and-Ethics.pdf https://dl.dropbox.com/scl/fi/boxb3mvc0mv7d7tndmy0j/BAIDU-INC.-CODE-OF-BUSINESS-CONDUCT-AND-ETHICS.pdf?rlkey=gs378jtvi6gwzy0ncn0jx0876
file_path = "./tmp/Baidu-Inc-Code-of-Business-Conduct-and-Ethics.pdf"

zsh:1: no matches found: https://dl.dropbox.com/scl/fi/boxb3mvc0mv7d7tndmy0j/BAIDU-INC.-CODE-OF-BUSINESS-CONDUCT-AND-ETHICS.pdf?rlkey=gs378jtvi6gwzy0ncn0jx0876


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [44]:
from rich.pretty import pprint
from typing import Any


def pretty_print(title: str = None, content: Any = None):
    """Print ``content``, optionally under a ``title`` line.

    Args:
        title: Heading printed before the content. When ``None`` the content
            is emitted with the built-in ``print`` and no heading.
        content: Object to display. When a title is given it is rendered
            with ``pprint`` instead of ``print``.
    """
    if title is not None:
        print(title)
        pprint(content)
    else:
        print(content)

In [45]:
# Tuning knobs for the retrieval pipeline built in the cells below.
# Presumably the number of candidates the vector retriever should return
# (similarity top-k) — TODO confirm where it is applied.
SIM_TOP_K = 5
# Passed as `top_n` to the SentenceTransformerRerank postprocessor.
RERANK_TOP_K = 5
# Passed as `window_size` to SentenceWindowNodeParser.
WIN_SZ = 5

In [46]:
from typing import List

from llama_index.core import (ServiceContext, SimpleDirectoryReader,
                              VectorStoreIndex)
from llama_index.core.embeddings.utils import EmbedType
from llama_index.core.indices.base import BaseIndex
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.llms.utils import LLMType
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.llms.ollama import Ollama
from llama_index.llms.groq import Groq

# LLM handed to the service context below. temperature=0 is requested for both
# the active Groq-hosted model and the commented-out local Ollama alternative.
# NOTE(review): the Groq client presumably needs an API key in the environment — confirm.
#llm: BaseLLM = Ollama(model="gemma:2b-instruct", temperature=0)
llm: BaseLLM = Groq(model="mixtral-8x7b-32768", temperature=0)

def create_service_context(llm: LLMType, embs: EmbedType) -> ServiceContext:
    """Build a ``ServiceContext`` whose node parser is sentence-window based.

    Args:
        llm: Language model forwarded to the service context.
        embs: Embedding model (or model spec) forwarded to the service context.

    Returns:
        A ``ServiceContext`` combining the given LLM and embedding model with
        a ``SentenceWindowNodeParser`` configured with ``WIN_SZ`` as its
        window size and the ``"window"`` / ``"original_text"`` metadata keys.
    """
    parser_options = dict(
        window_size=WIN_SZ,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_window_parser = SentenceWindowNodeParser.from_defaults(**parser_options)

    return ServiceContext.from_defaults(
        llm=llm,
        embed_model=embs,
        node_parser=sentence_window_parser,
    )


# Embedding model spec handed to the service context.
# Alternative: OpenAIEmbedding(model=OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002)
embs = "local:BAAI/bge-small-en-v1.5"
service_context: ServiceContext = create_service_context(llm, embs)

# Load the downloaded PDF. `load_data()` returns the loaded documents (a list),
# not the reader itself, so the variable is annotated accordingly.
input_files: List[str] = [file_path]
docs: List = SimpleDirectoryReader(
    input_files=input_files,
).load_data()

# Parse the documents into sentence-window nodes and embed them.
vector_index: BaseIndex = VectorStoreIndex.from_documents(
    docs,
    service_context=service_context,
    show_progress=True,
)

# Apply the configured similarity top-k. Without it the retriever falls back
# to the library default, which would leave the downstream reranker
# (top_n=RERANK_TOP_K) with fewer candidates than it is configured to keep.
base_retriever = vector_index.as_retriever(similarity_top_k=SIM_TOP_K)

  return ServiceContext.from_defaults(
Parsing nodes: 100%|██████████| 10/10 [00:00<00:00, 594.89it/s]
Generating embeddings: 100%|██████████| 160/160 [00:13<00:00, 11.50it/s]


In [47]:
from typing import List

from llama_index.core import PromptTemplate, QueryBundle
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore


class HyDERetriever(BaseRetriever):
    """Retriever that searches with an LLM-generated hypothetical document (HyDE).

    Instead of querying the wrapped retriever with the user's question, the
    question is first expanded by ``hypo_gen_model`` into a hypothetical text,
    and that text is used as the query for ``base_retriever``.

    Args:
        base_retriever: Retriever that performs the actual lookup.
        hypo_gen_model: LLM used to generate the hypothetical text.
    """

    def __init__(self, base_retriever: BaseRetriever, hypo_gen_model: BaseLLM):
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.base_retriever = base_retriever
        self.hypo_gen_model = hypo_gen_model
        self.hypothesis_template = PromptTemplate(
            """Write a hypothesis paper about question as you can.

            Only return the paper content without any other information, ie. leading text and so on.
            
            Question: {question}

            """
        )

    def _hypothesis_prompt(self, query_bundle: QueryBundle) -> str:
        """Render the hypothesis prompt for the query carried by ``query_bundle``."""
        return self.hypothesis_template.format(question=query_bundle.query_str)

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Synchronous HyDE retrieval.

        Mirrors ``_aretrieve`` so that synchronous callers get real results
        rather than an empty list.
        """
        gen_answer = self.hypo_gen_model.complete(
            self._hypothesis_prompt(query_bundle)
        ).text.strip()
        pretty_print("Generated info", gen_answer)
        return self.base_retriever.retrieve(gen_answer)

    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Asynchronous HyDE retrieval.

        Generates the hypothetical text with the LLM's async API so the event
        loop is not blocked, prints it, then queries the wrapped retriever
        with that text.
        """
        completion = await self.hypo_gen_model.acomplete(
            self._hypothesis_prompt(query_bundle)
        )
        gen_answer = completion.text.strip()
        pretty_print("Generated info", gen_answer)
        return await self.base_retriever.aretrieve(gen_answer)

In [48]:
from llama_index.core.indices.postprocessor import (
    MetadataReplacementPostProcessor,
    SentenceTransformerRerank,
)
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.ollama import Ollama
from llama_index.llms.groq import Groq
from llama_index.core.response_synthesizers.base import BaseSynthesizer
from llama_index.core import get_response_synthesizer
from llama_index.core.response_synthesizers.type import ResponseMode
from llama_index.core.base.response.schema import RESPONSE_TYPE

query_text = """What is the "CODE OF BUSINESS CONDUCT AND ETHICS" of BAIDU Inc.? 
    """


# hypo_gen_model: BaseLLM = Ollama(model="gemma:2b-instruct", temperature=1.5)
hypo_gen_model: BaseLLM = Groq(model="mixtral-8x7b-32768", temperature=1.5)
hyde_retriever = HyDERetriever(base_retriever, hypo_gen_model)
postproc: BaseNodePostprocessor = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)
rerank: BaseNodePostprocessor = SentenceTransformerRerank(
    top_n=RERANK_TOP_K, model="BAAI/bge-reranker-base"
)
response_synthesizer: BaseSynthesizer = get_response_synthesizer(
    service_context=service_context,
    response_mode=ResponseMode.REFINED,
)
final_res: RESPONSE_TYPE = await RetrieverQueryEngine(
    hyde_retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[postproc, rerank],
).aquery(query_text)

Generated info


In [51]:
from IPython.display import HTML
from llama_index.llms.groq import Groq
from llama_index.llms.ollama import Ollama

pretty_print("final_res.response", final_res.response)

# Formatting-only pass: the prompt forbids adding information, so sampling is
# made as deterministic as possible (temperature=0) rather than using the high
# temperature chosen for hypothesis generation.
fmt_model: BaseLLM = Groq(model="mixtral-8x7b-32768", temperature=0)
# Local alternative: Ollama(model="gemma:2b-instruct", temperature=0, request_timeout=120)
final_text: str = fmt_model.complete(
    f"""Format the text in HTML with appropriate markup for clarity and organization, bullet points if necessary, and so on.
    Do not add any additional information, only format the text.

    text: 
    
    {final_res.response}

    """
).text.strip()


# Last expression of the cell, so the notebook renders the returned HTML.
HTML(final_text)

final_res.response
