In [1]:
# !pip install -U FlagEmbedding -qq

In [8]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically and local code edits apply without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import json
from typing import List
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from FlagEmbedding import BGEM3FlagModel
from tqdm.auto import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [3]:
# Half precision (fp16) makes inference faster at the cost of slightly less
# precise embeddings.
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
# sentences_1 = ["What is BGE M3?", "Definition of BM25"]
# sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction."]
# A single example passage, duplicated to form a tiny two-item batch.
sample = (
    "BM25 is a bag-of-words retrieval function that ranks a set of documents "
    "based on the query terms appearing in each document"
)
batch = [sample, sample]

def get_sparse_vector(batch: List[str]):
    """Encode texts with the notebook-level ``model`` and return its lexical weights.

    Only the sparse output is requested; dense and ColBERT vectors are
    switched off in the ``encode`` call.

    Args:
        batch: The texts to encode.

    Returns:
        Whatever ``model.encode`` stores under the ``"lexical_weights"`` key
        of its output (presumably one entry per input text -- TODO confirm).
    """
    encoded = model.encode(
        batch,
        return_dense=False,
        return_sparse=True,
        return_colbert_vecs=False,
    )
    lexical_weights = encoded["lexical_weights"]
    return lexical_weights

In [5]:
# Load the "corpus" configuration of BeIR/scifact and keep its "corpus" split.
corpus = load_dataset(
    path="BeIR/scifact",
    name="corpus",
)["corpus"]

Downloading data:   0%|          | 0.00/4.58M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/5183 [00:00<?, ? examples/s]

In [6]:
def batch_iterator(iterable, batch_size=128):
    """
    Yield consecutive slices of a sized, sliceable sequence.

    Despite the parameter name, ``iterable`` must support both ``len()`` and
    slice indexing (e.g. list, tuple, str, ``range``); a plain generator or
    other one-shot iterator will not work.

    Args:
        iterable: A sequence supporting ``len()`` and slicing.
        batch_size: Maximum number of items per batch. Must be positive.

    Yields:
        ``iterable[start:end]`` for each consecutive window of up to
        ``batch_size`` items. The last batch may be shorter, and nothing is
        yielded for an empty sequence.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. Because this is a
            generator, the error surfaces when iteration starts, not when the
            function is called.
    """
    # Fail loudly: a negative step would otherwise silently yield nothing and
    # a zero step would raise an unrelated-looking range() error.
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    total = len(iterable)
    for start in range(0, total, batch_size):
        # Clamp the end index explicitly so the final slice never overshoots.
        end = min(start + batch_size, total)
        yield iterable[start:end]

# Example usage: a batch size larger than the input produces a single batch
# containing everything. The loop variable is deliberately not called `batch`,
# so it does not overwrite the module-level `batch` list defined earlier.
for demo_batch in batch_iterator(range(10), 12):
    print(demo_batch)

range(0, 10)


In [7]:
# NOTE(review): this repeats the corpus load from an earlier cell; it rebinds
# `corpus` to a freshly loaded copy of the same split.
corpus = load_dataset(
    path="BeIR/scifact",
    name="corpus",
)["corpus"]

In [10]:
# Display the column schema of the loaded corpus.
corpus.features

{'_id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [14]:
# Sparse-encode the "text" column of the whole corpus in a single call
# (the "title" column is not passed to the encoder).
# The recorded run took roughly 10 minutes for 432 encoder batches.
sparse_vectors = get_sparse_vector(corpus["text"])

Inference Embeddings: 100%|██████████| 432/432 [09:50<00:00,  1.37s/it]


In [15]:
# Dataset.add_column does not modify the dataset in place; it returns a new
# Dataset. The result must be bound back to `corpus`, otherwise later cells
# still see the dataset without the new column.
# The membership check keeps this cell safe to re-run after the column exists.
if "bge_m3_sparse_vector" not in corpus.column_names:
    corpus = corpus.add_column("bge_m3_sparse_vector", sparse_vectors)
corpus

Dataset({
    features: ['_id', 'title', 'text', 'bge_m3_sparse_vector'],
    num_rows: 5183
})

In [16]:
import getpass  # prompts for the token so it is never stored in the notebook

# Upload `corpus` as it currently stands to the Hugging Face Hub.
corpus.push_to_hub(
    repo_id="nirantk/scifact-bge-m3-sparse-vectors",
    token=getpass.getpass("Enter your Hugging Face API token: "),
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nirantk/scifact-bge-m3-sparse-vectors/commit/13935d28521a588a4474586b47437aa0faf16368', commit_message='Upload dataset', commit_description='', oid='13935d28521a588a4474586b47437aa0faf16368', pr_url=None, pr_revision=None, pr_num=None)