In [1]:
# !pip install -U pyvespa FlagEmbedding

In [2]:
# from FlagEmbedding import BGEM3FlagModel

# model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

In [3]:
# # create the model folder inside pkg to store the model
# import os

# os.makedirs("pkg/model", exist_ok=True)

# # save the model as a JSON tokenizer and an ONNX model
# bm_tokenizer = model.model.tokenizer # AutoModel
# bm_model = model.model.model # AutoModelWithLMHead

# # bm_tokenizer.save_pretrained("pkg/model/chekpoint")
# # bm_model.save_pretrained("pkg/model/chekpoint")

In [4]:
# passage = [
#     "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction."
# ]
# passage_embeddings = model.encode(
#     passage, return_dense=True, return_sparse=True, return_colbert_vecs=True
# )

In [5]:
# passage_embeddings.keys()

First, we define a Vespa schema with the fields we want to store and their type. We use Vespa tensors to represent the three different M3 representations.

We use a mapped tensor denoted by t{} to represent the sparse lexical representation

We use an indexed tensor denoted by x[1024] to represent the dense single vector representation of 1024 dimensions

For the colbert_rep (multi-vector), we use a mixed tensor that combines a mapped dimension (t{}) and an indexed dimension (x[1024]). The mapped dimension lets each document store a variable number of 1024-dimensional vectors.

We use bfloat16 tensor cell type, saving 50% storage compared to float.

In [6]:
from vespa.package import Schema, Document, Field, FieldSet

# Free-text fields: returned in summaries and indexed with BM25 enabled.
m_text_fields = [
    Field(
        name=text_field,
        type="string",
        indexing=["summary", "index"],
        index="enable-bm25",
    )
    for text_field in ("body", "title")
]

# The three BGE-M3 representations, all stored as bfloat16 attribute tensors.
m_tensor_fields = [
    # Sparse lexical weights: a single mapped dimension.
    Field(
        name="lexical_rep",
        type="tensor<bfloat16>(t{})",
        indexing=["summary", "attribute"],
    ),
    # Dense single-vector embedding (1024 dims) with the angular distance metric.
    Field(
        name="dense_rep",
        type="tensor<bfloat16>(x[1024])",
        indexing=["summary", "attribute"],
        attribute=["distance-metric: angular"],
    ),
    # Multi-vector (ColBERT-style) embedding: mapped dimension t, 1024 dims each.
    Field(
        name="colbert_rep",
        type="tensor<bfloat16>(t{}, x[1024])",
        indexing=["summary", "attribute"],
    ),
]

# Field order is kept as: id, body, title, lexical_rep, dense_rep, colbert_rep.
m_schema = Schema(
    name="findmypastabgem3",
    document=Document(
        fields=[
            Field(name="id", type="string", indexing=["summary"]),
            *m_text_fields,
            *m_tensor_fields,
        ],
    ),
    # Only "body" is part of the default fieldset.
    fieldsets=[FieldSet(name="default", fields=["body"])],
)

In [7]:
from vespa.package import Component, Parameter
# Vespa-side embedder component pointing at the BGE-M3 ONNX export on Hugging Face.
#
# Fixes relative to the previous definition:
#   * remote files are referenced with the "url" key; "path" is for files that
#     live inside the application package;
#   * Hugging Face "raw/main" links return the Git-LFS pointer stub for large
#     files, so "resolve/main" is used to fetch the real binary;
#   * the component type is "hugging-face-embedder", which is the embedder type
#     that accepts "transformer-model" / "tokenizer-model" parameters.
#
# NOTE(review): the upstream ONNX export appears to keep its weights in a
# separate external-data file next to model.onnx — confirm that Vespa can load
# this model from a single URL before relying on server-side embedding.
component = Component(
    id="bge_m3",
    type="hugging-face-embedder",
    parameters=[
        Parameter(
            "transformer-model",
            {"url": "https://huggingface.co/BAAI/bge-m3/resolve/main/onnx/model.onnx"},
        ),
        Parameter(
            "tokenizer-model",
            {"url": "https://huggingface.co/BAAI/bge-m3/resolve/main/onnx/tokenizer.json"},
        ),
    ],
)

In the last step, we configure ranking by adding a rank profile to the schema.

We define three functions, one per representation, that implement the three scoring functions:

dense (dense cosine similarity)

sparse (sparse dot product)

max_sim (The colbert max sim operation)

Then, we combine these three scoring functions in the first-phase expression as a weighted linear combination (0.4 * dense + 0.2 * lexical + 0.4 * max_sim), following the weighting suggested by the BGE-M3 authors.

In [8]:
from vespa.package import RankProfile, Function, FirstPhaseRanking


# Query-side tensors, one per M3 representation, plus the scalar used to
# normalise the max_sim score.
m3_query_inputs = [
    ("query(q_dense)", "tensor<bfloat16>(x[1024])"),
    ("query(q_lexical)", "tensor<bfloat16>(t{})"),
    ("query(q_colbert)", "tensor<bfloat16>(qt{}, x[1024])"),
    ("query(q_len_colbert)", "float"),
]

# One scoring function per representation.
m3_functions = [
    # Cosine similarity between the dense query and document vectors over x.
    Function(
        name="dense",
        expression="cosine_similarity(query(q_dense), attribute(dense_rep),x)",
    ),
    # Sum of the element-wise product of the sparse query and document weights.
    Function(
        name="lexical",
        expression="sum(query(q_lexical) * attribute(lexical_rep))",
    ),
    # For each query vector (qt) take the max dot product over document
    # vectors (t), sum over qt, then divide by query(q_len_colbert).
    Function(
        name="max_sim",
        expression="sum(reduce(sum(query(q_colbert) * attribute(colbert_rep) , x),max, t),qt)/query(q_len_colbert)",
    ),
]

# Weighted linear combination of the three scores, with the drop limit set to 0.0.
m3_first_phase = FirstPhaseRanking(
    expression="0.4*dense + 0.2*lexical +  0.4*max_sim",
    rank_score_drop_limit=0.0,
)

semantic = RankProfile(
    name="m3hybrid",
    inputs=m3_query_inputs,
    functions=m3_functions,
    first_phase=m3_first_phase,
    # Listed as match features so the individual scores and BM25 are available per hit.
    match_features=["dense", "lexical", "max_sim", "bm25(body)"],
)
m_schema.add_rank_profile(semantic)

In [9]:
from vespa.package import ApplicationPackage

vespa_app_name = "findmypastabgem3"

# Bundle the schema and the embedder component into one application package.
vespa_application_package = ApplicationPackage(
    name=vespa_app_name,
    schema=[m_schema],
    components=[component],
)
# vespa_application_package.to_files('./pkg/')

In [10]:
# # function to convert a datapoint into Vespa document fields
# def datapoint_to_vespa_fields(datapoint):
#     passage = [datapoint["body"]]
#     passage_embeddings = model.encode(
#         passage, return_dense=True, return_sparse=True, return_colbert_vecs=True
#     )
#     vespa_fields = {
#         "id": datapoint["id"],
#         "title": datapoint["title"],
#         "body": passage[0],
#         "lexical_rep": {
#             key: float(value)
#             for key, value in passage_embeddings["lexical_weights"][0].items()
#         },
#         "dense_rep": passage_embeddings["dense_vecs"][0].tolist(),
#         "colbert_rep": {
#             index: passage_embeddings["colbert_vecs"][0][index].tolist()
#             for index in range(passage_embeddings["colbert_vecs"][0].shape[0])
#         },
#     }
#     return vespa_fields

In [11]:
# import pandas as pd

# food_dataset = pd.read_csv("./input/food_dataset.csv")
# food_dataset = food_dataset[:5000]
# food_dataset

In [12]:
# from tqdm import tqdm
# batch_size = 100
# for i in tqdm(range(0, len(food_dataset), batch_size), total=len(food_dataset) // batch_size, desc="Embedding"):
#   print(f"Batch {i}")
#   food_dataset_batch = food_dataset[i:i+batch_size].copy()
#   vespa_fields = food_dataset_batch[["id", "body", "title"]].apply(datapoint_to_vespa_fields, axis=1)
#   vespa_fields.to_pickle(f"./embeddings/BGE-m3_batch_{i}.pkl")
# took 68 min to run for 5000 documents

In [13]:
from vespa.deployment import VespaDocker

# Deploy the application package to a local Vespa Docker container.
# NOTE(review): the recorded "Not supported URL scheme http+docker" failure
# looks like the known incompatibility between older `docker` Python SDK
# releases and newer `requests` releases — confirm the installed versions
# (upgrading the `docker` package is the usual remedy) before re-running.
vespa_docker = VespaDocker()
app = vespa_docker.deploy(
    application_package=vespa_application_package,
)


DockerException: Error while fetching server API version: Not supported URL scheme http+docker

In [None]:
# # loading the batches to feed:
# # listing the pickled files inside the embeddings folder
# import os
# import pickle

# embeddings_folder = "./embeddings"
# embeddings_files = os.listdir(embeddings_folder)
# embeddings_files

In [None]:
# from vespa.io import VespaResponse
# from ipywidgets import IntProgress, VBox, Label, Layout  # Import Layout for styling
# from IPython.display import display
# import time
# from vespa.application import Vespa

# class VespaFeeder:
#     def __init__(self, app):
#         self.app = app

#     def feed(self, data_to_feed):
#         self.vespa_feed_slice = data_to_feed.apply(self.to_vespa_format, axis=1)

#         # Create a progress bar widget
#         self.progress_bar = IntProgress(min=0, max=len(self.vespa_feed_slice), description='Progress:', layout=Layout(width='50%'))
#         self.progress_label = Label(value="Feeding documents: 0/{}".format(len(self.vespa_feed_slice)))

#         display(VBox([self.progress_bar, self.progress_label]))
#         self.start_time = time.time()

#         self.app.feed_iterable(self.vespa_feed_slice, schema="doc", namespace="findmypasta", callback=self.callback)

#     def to_vespa_format(self, x):
#         return {"id": x["id"], "fields": { "title": x["title"], "body": x["body"], "id": x["id"]}}


#     def callback(self, response: VespaResponse, id: str):
#         if not response.is_successful():
#             print(f"Error when feeding document {id}: {response.get_json()}")
        
#         # Update the progress bar value
#         self.progress_bar.value += 1
#         self.progress_label.value = f"Feeding documents: {self.progress_bar.value}/{self.progress_bar.max} ({self.progress_bar.value * 100 / self.progress_bar.max:.2f}%)"
#         self.update_estimated_time()

#     def update_estimated_time(self):
#         if self.progress_bar.value > 0:
#             self.progress_bar.bar_style = 'info'
#             self.progress_bar.style.bar_color = '#00AA00'
#             self.progress_bar.style.description_width = 'initial'
#             remaining_documents = self.progress_bar.max - self.progress_bar.value
#             time_per_document = (time.time() - self.start_time) / self.progress_bar.value
#             estimated_remaining_time = remaining_documents * time_per_document
#             self.progress_bar.description = f'Progress: (ETA: {self.format_time(estimated_remaining_time)})'

#     def format_time(self, time_in_seconds: float) -> str:
#         hours = int(time_in_seconds // 3600)
#         time_in_seconds = time_in_seconds - (hours * 3600)
#         minutes = int(time_in_seconds // 60)
#         seconds = int(time_in_seconds - (minutes * 60))
#         return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
