In [20]:
%pip install -q llama-index chromadb loguru pypdf openai sentence_transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os

import openai

# Resolve the OpenAI API key.
# On Google Colab the key is read from the notebook secrets (same as before).
# Anywhere else `google.colab` cannot be imported, so fall back to the
# OPENAI_API_KEY environment variable instead of failing on the import.
try:
    from google.colab import userdata
except ImportError:
    openai.api_key = os.environ.get("OPENAI_API_KEY")
else:
    openai.api_key = userdata.get('OPENAI_API_KEY')

In [3]:
!wget -O Baidu-Inc-Code-of-Business-Conduct-and-Ethics.pdf https://dl.dropbox.com/scl/fi/boxb3mvc0mv7d7tndmy0j/BAIDU-INC.-CODE-OF-BUSINESS-CONDUCT-AND-ETHICS.pdf?rlkey=gs378jtvi6gwzy0ncn0jx0876

--2024-02-02 22:03:09--  https://dl.dropbox.com/scl/fi/boxb3mvc0mv7d7tndmy0j/BAIDU-INC.-CODE-OF-BUSINESS-CONDUCT-AND-ETHICS.pdf?rlkey=gs378jtvi6gwzy0ncn0jx0876
Resolving dl.dropbox.com (dl.dropbox.com)... 162.125.64.15, 2620:100:6022:15::a27d:420f
Connecting to dl.dropbox.com (dl.dropbox.com)|162.125.64.15|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://dl.dropboxusercontent.com/scl/fi/boxb3mvc0mv7d7tndmy0j/BAIDU-INC.-CODE-OF-BUSINESS-CONDUCT-AND-ETHICS.pdf?rlkey=gs378jtvi6gwzy0ncn0jx0876 [following]
--2024-02-02 22:03:09--  https://dl.dropboxusercontent.com/scl/fi/boxb3mvc0mv7d7tndmy0j/BAIDU-INC.-CODE-OF-BUSINESS-CONDUCT-AND-ETHICS.pdf?rlkey=gs378jtvi6gwzy0ncn0jx0876
Resolving dl.dropboxusercontent.com (dl.dropboxusercontent.com)... 162.125.71.15, 2620:100:6021:15::a27d:410f
Connecting to dl.dropboxusercontent.com (dl.dropboxusercontent.com)|162.125.71.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 221928 (217K) [app

In [4]:
# Local name of the PDF to index; it matches the `-O` target of the wget cell.
FILE_NAME: str = "Baidu-Inc-Code-of-Business-Conduct-and-Ethics.pdf"

In [31]:
from typing import List
import os
import chromadb
from chromadb.api.models.Collection import Collection
from llama_index import (
    QueryBundle,
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    get_response_synthesizer,
)

from llama_index.response_synthesizers.type import ResponseMode
from llama_index.core.base_query_engine import BaseQueryEngine
from llama_index.core.base_retriever import BaseRetriever
from llama_index.embeddings import OpenAIEmbedding
from llama_index.embeddings.openai import OpenAIEmbeddingModelType
from llama_index.embeddings.utils import EmbedType
from llama_index.indices.base import BaseIndex
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.llms import OpenAI
from llama_index.llms.utils import LLMType
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response.schema import RESPONSE_TYPE
from llama_index.response_synthesizers.base import BaseSynthesizer
from llama_index.schema import NodeWithScore
from llama_index.vector_stores import ChromaVectorStore
from loguru import logger
from llama_index.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
    DocumentSummaryIndexLLMRetriever,
)
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.postprocessor.types import BaseNodePostprocessor

In [26]:
# --- Input -------------------------------------------------------------------
DOC_DIR: str = "./"  # directory argument handed to SimpleDirectoryReader

# --- LLM ---------------------------------------------------------------------
TEMPERATURE: float = 0.0  # temperature passed to the OpenAI LLM

# --- Node parsing ------------------------------------------------------------
WIN_SZ: int = 3  # window_size for SentenceWindowNodeParser
# Only referenced by the commented-out chunking options in create_service_context.
CHUNK_SIZE: int = 5000
CHUNK_OVERLAP: int = 100

# --- Retrieval ---------------------------------------------------------------
MODE: str = "OR"  # combination mode for MultiVectorSummaryRetriever: "AND" or "OR"
SIM_TOP_K: int = 3  # top-k passed to the summary and vector retrievers
RERANK_TOP_K: int = 3  # top_n kept by the SentenceTransformerRerank postprocessor

In [7]:
def create_vectors(path: str, collection_name="tmp_collection") -> Collection:
    """Return a Chroma collection from a persistent client rooted at ``path``.

    Args:
        path: Location handed to ``chromadb.PersistentClient``.
        collection_name: Name of the collection to fetch, or to create when it
            does not exist yet.

    Returns:
        The collection returned by ``get_or_create_collection``.
    """
    # Background: https://github.com/run-llama/llama_index/issues/6528
    client = chromadb.PersistentClient(path)
    collection = client.get_or_create_collection(collection_name)
    return collection

In [8]:
class MultiVectorSummaryRetriever(BaseRetriever):
    """Retriever that merges the results of a summary and a vector retriever.

    Both wrapped retrievers are queried with the same ``QueryBundle`` and their
    results are combined by node id, either as an intersection (``"AND"``) or
    as a union (``"OR"``). When both retrievers return a node with the same id,
    the ``NodeWithScore`` coming from the summary retriever is the one kept.
    """

    def __init__(
        self,
        summary_retriever: BaseRetriever,
        vector_retriever: BaseRetriever,
        mode: str = "OR",
    ) -> None:
        """Store the wrapped retrievers and validate the combination mode.

        Args:
            summary_retriever: Retriever whose results take precedence on
                duplicate node ids.
            vector_retriever: Retriever whose results define the leading order
                of the combined list.
            mode: ``"AND"`` to keep only nodes returned by both retrievers,
                ``"OR"`` to keep nodes returned by either.

        Raises:
            ValueError: If ``mode`` is neither ``"AND"`` nor ``"OR"``.
        """
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")

        self._summary_retriever = summary_retriever
        self._vector_retriever = vector_retriever
        self._mode = mode
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes from both retrievers and combine them by node id.

        The result order is deterministic: vector-retriever order first, then
        (in ``"OR"`` mode) the nodes only the summary retriever returned, in
        its order. Iterating a ``set`` of string ids, as done previously, gave
        an order that could change from one interpreter session to the next.

        Args:
            query_bundle: Query forwarded unchanged to both retrievers.

        Returns:
            The combined nodes, one entry per distinct node id.
        """
        summary_nodes = self._summary_retriever.retrieve(query_bundle)
        vector_nodes = self._vector_retriever.retrieve(query_bundle)

        # Dicts keep insertion order, so these double as ordered id sets.
        summary_by_id = {n.node.node_id: n for n in summary_nodes}
        vector_by_id = {n.node.node_id: n for n in vector_nodes}

        if self._mode == "AND":
            # Intersection; the summary retriever's entry is kept for shared ids.
            return [
                summary_by_id[node_id]
                for node_id in vector_by_id
                if node_id in summary_by_id
            ]

        # Union; summary entries overwrite vector entries that share an id while
        # the key keeps the position it was first inserted at.
        combined = {**vector_by_id, **summary_by_id}
        return list(combined.values())

In [9]:
def create_service_context(llm: LLMType, embs: EmbedType) -> ServiceContext:
    """Build the ``ServiceContext`` combining the LLM, embeddings and node parser.

    The node parser is a ``SentenceWindowNodeParser`` created with
    ``window_size=WIN_SZ``; it records the window under the ``"window"``
    metadata key and the original text under ``"original_text"``.

    Args:
        llm: Language model passed through as ``llm``.
        embs: Embedding model (or model spec) passed through as ``embed_model``.

    Returns:
        The ``ServiceContext`` produced by ``ServiceContext.from_defaults``.
    """
    sentence_window_parser = SentenceWindowNodeParser.from_defaults(
        original_text_metadata_key="original_text",
        window_metadata_key="window",
        window_size=WIN_SZ,
    )

    # chunk_size / chunk_overlap (CHUNK_SIZE, CHUNK_OVERLAP) are intentionally
    # not passed; the sentence-window parser above is supplied instead.
    context_kwargs = dict(
        llm=llm,
        embed_model=embs,
        node_parser=sentence_window_parser,
    )
    return ServiceContext.from_defaults(**context_kwargs)


def create_storage_context() -> StorageContext:
    """Build a ``StorageContext`` whose vector store is a Chroma collection.

    The collection comes from ``create_vectors`` using the fixed path
    ``./db/LlamaIndexMultiVectorSummary`` and that function's default
    collection name.

    Returns:
        The ``StorageContext`` produced by ``StorageContext.from_defaults``.
    """
    db_path: str = "./db/LlamaIndexMultiVectorSummary"
    collection = create_vectors(path=db_path)
    vector_store = ChromaVectorStore(collection)
    return StorageContext.from_defaults(vector_store=vector_store)

In [10]:
# LLM handed to the service context.
llm = OpenAI(model="gpt-4-1106-preview", temperature=TEMPERATURE)

# Embedding model, given as a "local:" model spec.
# Alternative: OpenAIEmbedding(model=OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002)
embs = "local:BAAI/bge-small-en-v1.5"

service_context: ServiceContext = create_service_context(llm, embs)
storage_context: StorageContext = create_storage_context()

# Read the downloaded PDF.
# NOTE(review): DOC_DIR and required_exts are passed together with an explicit
# input_files list; presumably input_files takes precedence - confirm against
# the SimpleDirectoryReader documentation.
required_exts: List[str] = [".pdf"]
input_files: List[str] = [FILE_NAME]
reader: SimpleDirectoryReader = SimpleDirectoryReader(
    DOC_DIR,
    required_exts=required_exts,
    input_files=input_files,
)
# load_data() returns the loaded documents, not a reader, so `docs` no longer
# carries the SimpleDirectoryReader annotation.
docs = reader.load_data()

# NOTE(review): both indices are built with the same storage_context, i.e. the
# same persistent Chroma collection, and that collection is reused when this
# cell is re-run. Verify this does not mix summary and sentence embeddings or
# accumulate duplicates across runs.
summary_index: BaseIndex = DocumentSummaryIndex.from_documents(
    docs,
    storage_context=storage_context,
    service_context=service_context,
    show_progress=True,
)

vector_index: BaseIndex = VectorStoreIndex.from_documents(
    docs,
    service_context=service_context,
    storage_context=storage_context,
    show_progress=True,
)

# The indices were just built from the documents; nothing is loaded from storage
# here, so the message says so.
logger.info("Built summary and vector indices from documents")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Parsing nodes:   0%|          | 0/10 [00:00<?, ?it/s]

Summarizing documents:   0%|          | 0/10 [00:00<?, ?it/s]

current doc id: 8fd8b1d0-55d2-4198-8a2d-1dcfd42c8817
current doc id: c87c09bf-204b-4d67-9fe5-6a9c2edf25a5
current doc id: 7c19cb1d-6f44-4d67-8fd1-20fe82cb83c2
current doc id: 7ab088a3-f36b-43cb-ba8e-665773788e93
current doc id: e269692a-5538-4218-a6e0-ed1f8047ce0f
current doc id: 3071c51b-b3ae-43ee-8bd7-2194d4a8ed71
current doc id: 1e94837f-90db-4bd3-abf0-2d63a11901b3
current doc id: 5fdd883e-f9af-44a2-a13c-3695dc2f71f3
current doc id: 7750fbe7-5e31-4ed4-9f19-672638f4a235
current doc id: 890efcaa-7f06-4551-9bb4-6be3413e3d5c


Generating embeddings:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/10 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/160 [00:00<?, ?it/s]

[32m2024-02-02 22:06:12.536[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 35>[0m:[36m35[0m - [1mLoading index from storage[0m


In [32]:
# Summary-level retriever. Two implementations are imported:
#   - DocumentSummaryIndexEmbeddingRetriever, for embedding retriever mode
#   - DocumentSummaryIndexLLMRetriever, for LLM based retriever mode
# The LLM retriever takes its top-k as `choice_top_k`; `similarity_top_k` is the
# embedding retriever's parameter, so passing it here presumably had no effect.
summary_retriever: BaseRetriever = DocumentSummaryIndexLLMRetriever(
    summary_index,
    choice_top_k=SIM_TOP_K,
)
vector_retriever: BaseRetriever = vector_index.as_retriever(
    similarity_top_k=SIM_TOP_K,
)

# Combine both retrievers; MODE selects intersection ("AND") or union ("OR").
multi_vec_sum_retriever: BaseRetriever = MultiVectorSummaryRetriever(
    summary_retriever=summary_retriever,
    vector_retriever=vector_retriever,
    mode=MODE,
)
response_synthesizer: BaseSynthesizer = get_response_synthesizer(
    service_context=service_context,
    response_mode=ResponseMode.REFINE,
)

# Postprocessors run in list order: first the metadata replacement keyed on
# "window" (the key written by the sentence-window node parser), then the
# reranker that keeps RERANK_TOP_K nodes.
postproc: BaseNodePostprocessor = MetadataReplacementPostProcessor(
    target_metadata_key="window",
)
rerank: BaseNodePostprocessor = SentenceTransformerRerank(
    top_n=RERANK_TOP_K,
    model="BAAI/bge-reranker-base",
)

query_engine: BaseQueryEngine = RetrieverQueryEngine(
    retriever=multi_vec_sum_retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[postproc, rerank],
)

# Query

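The settings

`SIM_TOP_K = 3`

`RERANK_TOP_K = 3`

affect the query duration: higher values mean more nodes are retrieved, kept after reranking, and passed to the response synthesizer.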

In [33]:
# Prompt sent to the query engine. It is a single triple-quoted literal, so the
# leading spaces on its continuation lines are part of the text that is sent.
prompt = """What is the topic of content?
    The response should be presented as a list of key points, after creating the title of the content,
    formatted in HTML with appropriate markup for clarity and organization.
    """
# Run the prompt through query_engine; the result's `.response` is displayed later.
result: RESPONSE_TYPE = query_engine.query(prompt)

In [35]:
from IPython.display import display, HTML

# Render the response text as HTML in the cell output.
# NOTE(review): the model-generated text is inserted unescaped - confirm that is
# acceptable wherever this notebook is shared.
answer_html = HTML(result.response)
display(answer_html)