In [1]:
# !pip install -U FlagEmbedding -qq

In [8]:
%load_ext autoreload
%autoreload 2

In [36]:
import getpass
import json
import os
from typing import List

from datasets import Dataset, load_dataset
from FlagEmbedding import BGEM3FlagModel

from dotenv import load_dotenv

load_dotenv()

True

In [3]:
# Instantiate the BGE-M3 model once; every later encoding cell uses this module-level `model`.
# The recorded output shows model files being fetched when this cell ran.
model = BGEM3FlagModel(
    "BAAI/bge-m3", use_fp16=True
)  # Setting use_fp16 to True speeds up computation with a slight performance degradation

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
# sentences_1 = ["What is BGE M3?", "Definition of BM25"]
# sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction."]
# Toy input: a single BM25 description, repeated to form a two-item batch.
sample = (
    "BM25 is a bag-of-words retrieval function that ranks a set of documents "
    "based on the query terms appearing in each document"
)
batch = [sample, sample]


def get_sparse_vector(batch: List[str]):
    """Encode texts with the module-level ``model`` and return only the sparse output.

    Args:
        batch: The texts to encode.

    Returns:
        The ``"lexical_weights"`` entry of the encoder's output.
    """
    # Only the sparse representation is requested; dense and ColBERT outputs are off.
    encode_options = {
        "return_dense": False,
        "return_sparse": True,
        "return_colbert_vecs": False,
    }
    encoded = model.encode(batch, **encode_options)
    return encoded["lexical_weights"]

In [5]:
# Load the "corpus" configuration of BeIR/scifact and keep its "corpus" split.
corpus = load_dataset("BeIR/scifact", "corpus")["corpus"]

Downloading data:   0%|          | 0.00/4.58M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/5183 [00:00<?, ? examples/s]

In [6]:
def batch_iterator(iterable, batch_size=128):
    """
    Iterates over an iterable in batches of a given size.

    Args:
        iterable: Either a sized, sliceable sequence (list, range, str, ...)
            or any other iterable without ``len()``, such as a generator.
        batch_size: The maximum size of each batch. Must be positive.

    Yields:
        A batch of items from the iterable. For sized inputs each batch is a
        slice of the input (so its type is whatever slicing the input
        returns); for length-less inputs each batch is a list. The final
        batch may be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. As this is a
            generator, the error is raised when iteration starts.
    """
    # A negative step would make range() below silently produce no batches.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    try:
        iterable_length = len(iterable)
    except TypeError:
        # No len() available (e.g. a generator): consume the input lazily.
        from itertools import islice

        stream = iter(iterable)
        while True:
            chunk = list(islice(stream, batch_size))
            if not chunk:
                return
            yield chunk

    for ndx in range(0, iterable_length, batch_size):
        # The explicit upper bound keeps the slice inside the input even for
        # containers whose slicing does not clamp on its own.
        yield iterable[ndx : min(ndx + batch_size, iterable_length)]


# Example usage: a batch size larger than the input yields a single batch holding everything.
# The loop variable is not named `batch`, so it does not overwrite the module-level
# `batch` list defined in an earlier cell.
for demo_batch in batch_iterator(range(10), 12):
    print(demo_batch)

range(0, 10)


In [7]:
# NOTE(review): this repeats the load from an earlier cell with identical arguments and
# simply rebinds `corpus`; one of the two cells is presumably redundant.
corpus = load_dataset("BeIR/scifact", "corpus")["corpus"]

In [10]:
# Display the column schema of the corpus split.
corpus.features

{'_id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [39]:
# Encode the whole "text" column in one call. This is the expensive step:
# the recorded progress bar shows 432 steps taking 10:38.
raw_sparse_vectors = get_sparse_vector(corpus["text"])

Inference Embeddings: 100%|██████████| 432/432 [10:38<00:00,  1.48s/it]


In [1]:
# Peek at the first sparse vector. This needs `raw_sparse_vectors` from the encoding cell
# in the same kernel session; the recorded NameError comes from a run where it was not defined.
raw_sparse_vectors[0]

NameError: name 'raw_sparse_vectors' is not defined

In [70]:
# Cast every value to a built-in float (the original note says the raw values are float16).
float_sparse_vectors = [
    {key: float(weight) for key, weight in sparse_vector.items()}
    for sparse_vector in raw_sparse_vectors
]

# Serialize each vector to its own JSON string.
json_sparse_vectors = list(map(json.dumps, float_sparse_vectors))

In [71]:
# Attach the JSON strings as an extra column; the result is bound to `new_ds`
# and `corpus` is not reassigned.
new_ds = corpus.add_column("bge_m3_sparse_vector", json_sparse_vectors)

In [72]:
# Inspect the first row of the augmented dataset.
new_ds[0]

{'_id': '4983',
 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.',
 'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, t

In [73]:
# Check the schema of the augmented dataset, including the added column.
new_ds.features

{'_id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'bge_m3_sparse_vector': Value(dtype='string', id=None)}

In [74]:
# Publish the augmented dataset to the Hugging Face Hub.
# Resolve the token first: use HF_WRITE_KEY when it is set and non-empty, and only
# fall back to an interactive prompt otherwise. Passing getpass.getpass(...) as the
# default argument of os.getenv would run the prompt on every execution, because
# Python evaluates call arguments before the call is made.
hf_token = os.getenv("HF_WRITE_KEY") or getpass.getpass("Enter token: ")
new_ds.push_to_hub(
    "nirantk/scifact-bge-m3-sparse-vectors",
    token=hf_token,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/764k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nirantk/scifact-bge-m3-sparse-vectors/commit/25678d95f13b366c6a8a50e0290ad0667ba79f75', commit_message='Upload dataset', commit_description='', oid='25678d95f13b366c6a8a50e0290ad0667ba79f75', pr_url=None, pr_revision=None, pr_num=None)